diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,90260 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999806054964023, + "eval_steps": 500, + "global_step": 12890, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 7.519832075963106, + "learning_rate": 5.16795865633075e-08, + "loss": 1.0553, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 5.888495993953021, + "learning_rate": 1.03359173126615e-07, + "loss": 1.0691, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 7.777014625287366, + "learning_rate": 1.5503875968992249e-07, + "loss": 1.0115, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 8.288166885816697, + "learning_rate": 2.0671834625323e-07, + "loss": 1.0531, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.728059218918297, + "learning_rate": 2.583979328165375e-07, + "loss": 1.0227, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 6.903468059340747, + "learning_rate": 3.1007751937984497e-07, + "loss": 1.071, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 7.673274081631383, + "learning_rate": 3.6175710594315246e-07, + "loss": 1.1014, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 7.127045187299337, + "learning_rate": 4.1343669250646e-07, + "loss": 1.0198, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 8.42937660527831, + "learning_rate": 4.651162790697675e-07, + "loss": 1.0276, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 7.625662352763816, + "learning_rate": 5.16795865633075e-07, + "loss": 0.9806, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 6.336066643354788, + "learning_rate": 5.684754521963825e-07, + "loss": 1.0316, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 6.462067474116861, + "learning_rate": 6.201550387596899e-07, + "loss": 1.0035, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 5.403141428987006, + "learning_rate": 6.718346253229975e-07, + "loss": 1.0206, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 6.128595347934199, + "learning_rate": 7.235142118863049e-07, + "loss": 0.9459, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 5.395879777195401, + "learning_rate": 7.751937984496125e-07, + "loss": 0.9791, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 6.316994153124984, + "learning_rate": 8.2687338501292e-07, + "loss": 0.8712, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 5.630215579253789, + "learning_rate": 8.785529715762274e-07, + "loss": 0.9682, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 5.294689706674586, + "learning_rate": 9.30232558139535e-07, + "loss": 0.9027, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 4.422215964176046, + "learning_rate": 9.819121447028424e-07, + "loss": 0.9334, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 3.3414770939835425, + "learning_rate": 1.03359173126615e-06, + "loss": 0.9431, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.195632731985836, + "learning_rate": 1.0852713178294575e-06, + "loss": 0.934, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 2.612591981865614, + "learning_rate": 1.136950904392765e-06, + "loss": 0.9314, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 2.6314997357443173, + "learning_rate": 1.1886304909560723e-06, + "loss": 0.8186, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 2.764521836778098, + "learning_rate": 1.2403100775193799e-06, + "loss": 0.8507, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.9395074680919357, + "learning_rate": 1.2919896640826874e-06, + "loss": 0.8617, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 2.609294136301998, + "learning_rate": 1.343669250645995e-06, + "loss": 0.8471, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.4551541628476996, + "learning_rate": 1.3953488372093025e-06, + "loss": 0.8961, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 2.71237206223252, + "learning_rate": 1.4470284237726098e-06, + "loss": 0.8651, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.414596350445245, + "learning_rate": 1.4987080103359176e-06, + "loss": 0.8452, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.522648076681847, + "learning_rate": 1.550387596899225e-06, + "loss": 0.8308, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 2.482017790337925, + "learning_rate": 1.6020671834625322e-06, + "loss": 0.7951, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.694296117241535, + "learning_rate": 1.65374677002584e-06, + "loss": 0.8104, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.2735440468482206, + "learning_rate": 1.7054263565891473e-06, + "loss": 0.878, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.460717666737549, + "learning_rate": 1.7571059431524549e-06, + "loss": 0.7748, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 2.615230363724519, + "learning_rate": 1.8087855297157624e-06, + "loss": 0.8503, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.0739543624168046, + "learning_rate": 1.86046511627907e-06, + "loss": 0.7561, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.098811848895224, + "learning_rate": 1.9121447028423773e-06, + "loss": 0.7309, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.4512717663597585, + "learning_rate": 1.963824289405685e-06, + "loss": 0.8263, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.2516002262721893, + "learning_rate": 2.0155038759689923e-06, + "loss": 0.8068, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.596235532100443, + "learning_rate": 2.0671834625323e-06, + "loss": 0.8449, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.545090219663501, + "learning_rate": 2.1188630490956074e-06, + "loss": 0.8061, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 2.363567341283558, + "learning_rate": 2.170542635658915e-06, + "loss": 0.8206, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 2.163689095682903, + "learning_rate": 2.222222222222222e-06, + "loss": 0.7695, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.2080585620688122, + "learning_rate": 2.27390180878553e-06, + "loss": 0.7675, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.3873360143137075, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.7691, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.494632490226066, + "learning_rate": 2.3772609819121447e-06, + "loss": 0.7957, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.244123945182519, + "learning_rate": 2.4289405684754527e-06, + "loss": 0.7594, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 2.277922201876151, + "learning_rate": 2.4806201550387598e-06, + "loss": 0.7248, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.0354233345018833, + "learning_rate": 2.5322997416020673e-06, + "loss": 0.7247, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 2.1956830238083183, + "learning_rate": 2.583979328165375e-06, + "loss": 0.7942, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 2.1086478781492746, + "learning_rate": 2.635658914728683e-06, + "loss": 0.7626, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 2.212343665165485, + "learning_rate": 2.68733850129199e-06, + "loss": 0.7707, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.185894513201737, + "learning_rate": 2.7390180878552975e-06, + "loss": 0.7752, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 2.391570359350091, + "learning_rate": 2.790697674418605e-06, + "loss": 0.7444, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 2.625443466510659, + "learning_rate": 2.842377260981912e-06, + "loss": 0.7962, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 1.8983609533379908, + "learning_rate": 2.8940568475452197e-06, + "loss": 0.7051, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 2.1952030249616508, + "learning_rate": 2.9457364341085276e-06, + "loss": 0.7571, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 2.077316241532325, + "learning_rate": 2.997416020671835e-06, + "loss": 0.7287, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 2.227086591108584, + "learning_rate": 3.0490956072351423e-06, + "loss": 0.7853, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 2.2554648050155466, + "learning_rate": 3.10077519379845e-06, + "loss": 0.7785, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 2.1596438471020365, + "learning_rate": 3.1524547803617574e-06, + "loss": 0.8295, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 2.282697453560096, + "learning_rate": 3.2041343669250645e-06, + "loss": 0.7639, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 1.758532025047504, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.687, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 2.2299768870913597, + "learning_rate": 3.30749354005168e-06, + "loss": 0.7668, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.111093569844413, + "learning_rate": 3.3591731266149875e-06, + "loss": 0.8231, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 2.1534840631827485, + "learning_rate": 3.4108527131782946e-06, + "loss": 0.7143, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 2.216368458223642, + "learning_rate": 3.462532299741602e-06, + "loss": 0.7461, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.3280494216837937, + "learning_rate": 3.5142118863049097e-06, + "loss": 0.7476, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.241321356552836, + "learning_rate": 3.5658914728682177e-06, + "loss": 0.7498, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 2.1221608661794136, + "learning_rate": 3.617571059431525e-06, + "loss": 0.7522, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.0474193518921426, + "learning_rate": 3.6692506459948323e-06, + "loss": 0.8124, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.9518444288280057, + "learning_rate": 3.72093023255814e-06, + "loss": 0.7243, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.1636466718184812, + "learning_rate": 3.772609819121447e-06, + "loss": 0.7377, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.856977439681704, + "learning_rate": 3.8242894056847545e-06, + "loss": 0.7054, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.0053756710099635, + "learning_rate": 3.875968992248063e-06, + "loss": 0.7338, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 2.158365731996466, + "learning_rate": 3.92764857881137e-06, + "loss": 0.7236, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 2.4102976060464267, + "learning_rate": 3.979328165374677e-06, + "loss": 0.7754, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.17610689256887, + "learning_rate": 4.031007751937985e-06, + "loss": 0.7185, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.0942309382339497, + "learning_rate": 4.082687338501292e-06, + "loss": 0.6833, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.2703418958195645, + "learning_rate": 4.1343669250646e-06, + "loss": 0.7626, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.3007004210388113, + "learning_rate": 4.186046511627907e-06, + "loss": 0.7403, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.8774740106252445, + "learning_rate": 4.237726098191215e-06, + "loss": 0.7147, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.5136139217593367, + "learning_rate": 4.289405684754522e-06, + "loss": 0.7195, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 2.1863608527720064, + "learning_rate": 4.34108527131783e-06, + "loss": 0.7224, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 2.2834091419898668, + "learning_rate": 4.3927648578811375e-06, + "loss": 0.7566, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 2.386091234092043, + "learning_rate": 4.444444444444444e-06, + "loss": 0.7626, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 2.3674971878373423, + "learning_rate": 4.4961240310077525e-06, + "loss": 0.6586, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 2.172996505874935, + "learning_rate": 4.54780361757106e-06, + "loss": 0.6604, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.9070537311083688, + "learning_rate": 4.599483204134368e-06, + "loss": 0.6725, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.9524328607599153, + "learning_rate": 4.651162790697675e-06, + "loss": 0.7146, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.0347277873038725, + "learning_rate": 4.702842377260982e-06, + "loss": 0.7531, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.209982622000607, + "learning_rate": 4.754521963824289e-06, + "loss": 0.7188, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.9692459087380176, + "learning_rate": 4.806201550387598e-06, + "loss": 0.6813, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.1049024850181413, + "learning_rate": 4.857881136950905e-06, + "loss": 0.743, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.1798186450732397, + "learning_rate": 4.909560723514212e-06, + "loss": 0.7513, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 2.1645572248346174, + "learning_rate": 4.9612403100775195e-06, + "loss": 0.7533, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.0228318887762877, + "learning_rate": 5.012919896640828e-06, + "loss": 0.6592, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.206621682303021, + "learning_rate": 5.064599483204135e-06, + "loss": 0.7562, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.1311466852020433, + "learning_rate": 5.116279069767442e-06, + "loss": 0.7061, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.9809595587086415, + "learning_rate": 5.16795865633075e-06, + "loss": 0.7214, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.9387466972711545, + "learning_rate": 5.219638242894057e-06, + "loss": 0.7502, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.8229280598602298, + "learning_rate": 5.271317829457366e-06, + "loss": 0.6942, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.8532957824955443, + "learning_rate": 5.322997416020672e-06, + "loss": 0.732, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.9006885008912018, + "learning_rate": 5.37467700258398e-06, + "loss": 0.6513, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.9933017382249472, + "learning_rate": 5.4263565891472865e-06, + "loss": 0.7436, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.0787408819589306, + "learning_rate": 5.478036175710595e-06, + "loss": 0.6851, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.9692236315472937, + "learning_rate": 5.529715762273902e-06, + "loss": 0.7102, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 2.0958519249073144, + "learning_rate": 5.58139534883721e-06, + "loss": 0.6779, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 2.1179506397493193, + "learning_rate": 5.6330749354005176e-06, + "loss": 0.7571, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.9953179391474662, + "learning_rate": 5.684754521963824e-06, + "loss": 0.6988, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.209611906437212, + "learning_rate": 5.736434108527133e-06, + "loss": 0.7277, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.9401680204051799, + "learning_rate": 5.788113695090439e-06, + "loss": 0.6803, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.1188809424704016, + "learning_rate": 5.839793281653747e-06, + "loss": 0.7199, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.0943100592990525, + "learning_rate": 5.891472868217055e-06, + "loss": 0.6849, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 2.2713837146185636, + "learning_rate": 5.943152454780362e-06, + "loss": 0.7577, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.127209132176853, + "learning_rate": 5.99483204134367e-06, + "loss": 0.7871, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 2.034930723731037, + "learning_rate": 6.046511627906977e-06, + "loss": 0.7148, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.136418574307478, + "learning_rate": 6.0981912144702846e-06, + "loss": 0.6921, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.007500412728349, + "learning_rate": 6.149870801033592e-06, + "loss": 0.6471, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.0923841205992715, + "learning_rate": 6.2015503875969e-06, + "loss": 0.7567, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.266590622953997, + "learning_rate": 6.253229974160208e-06, + "loss": 0.6969, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 2.061399223889906, + "learning_rate": 6.304909560723515e-06, + "loss": 0.7352, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.9181609203148584, + "learning_rate": 6.356589147286822e-06, + "loss": 0.6856, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 2.1843507167792544, + "learning_rate": 6.408268733850129e-06, + "loss": 0.6914, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.226492897418139, + "learning_rate": 6.459948320413437e-06, + "loss": 0.7682, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.9479308374334845, + "learning_rate": 6.511627906976745e-06, + "loss": 0.6866, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.9634312430850824, + "learning_rate": 6.563307493540052e-06, + "loss": 0.6599, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.0373930546127754, + "learning_rate": 6.61498708010336e-06, + "loss": 0.6946, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.9244260526837411, + "learning_rate": 6.666666666666667e-06, + "loss": 0.7157, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.883453956330838, + "learning_rate": 6.718346253229975e-06, + "loss": 0.6764, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.0045545692999402, + "learning_rate": 6.7700258397932826e-06, + "loss": 0.6809, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.9606349757302202, + "learning_rate": 6.821705426356589e-06, + "loss": 0.7321, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.8672674952014412, + "learning_rate": 6.873385012919898e-06, + "loss": 0.6396, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.8517558322844003, + "learning_rate": 6.925064599483204e-06, + "loss": 0.6908, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.0440099335072635, + "learning_rate": 6.976744186046513e-06, + "loss": 0.7415, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.979158649093718, + "learning_rate": 7.028423772609819e-06, + "loss": 0.6901, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.9189054916948147, + "learning_rate": 7.080103359173127e-06, + "loss": 0.6229, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.8979967649651344, + "learning_rate": 7.131782945736435e-06, + "loss": 0.6727, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 2.089207860872488, + "learning_rate": 7.183462532299742e-06, + "loss": 0.6474, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 2.05975957356314, + "learning_rate": 7.23514211886305e-06, + "loss": 0.7032, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 2.1621687001374346, + "learning_rate": 7.286821705426357e-06, + "loss": 0.7346, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.029956346991461, + "learning_rate": 7.338501291989665e-06, + "loss": 0.7053, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.9328938768505268, + "learning_rate": 7.390180878552973e-06, + "loss": 0.6723, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 2.4819536225184136, + "learning_rate": 7.44186046511628e-06, + "loss": 0.73, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 2.2634611122156376, + "learning_rate": 7.493540051679587e-06, + "loss": 0.7043, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 2.03845568283746, + "learning_rate": 7.545219638242894e-06, + "loss": 0.7747, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 2.170724138477728, + "learning_rate": 7.596899224806202e-06, + "loss": 0.7935, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 2.0504183784244647, + "learning_rate": 7.648578811369509e-06, + "loss": 0.6595, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.8734228494690135, + "learning_rate": 7.700258397932817e-06, + "loss": 0.7251, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 2.1235225253454257, + "learning_rate": 7.751937984496126e-06, + "loss": 0.6747, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 2.1341870248884347, + "learning_rate": 7.803617571059433e-06, + "loss": 0.6682, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 2.1759810022127875, + "learning_rate": 7.85529715762274e-06, + "loss": 0.6936, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 2.1956170030421633, + "learning_rate": 7.906976744186048e-06, + "loss": 0.7492, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 2.483555592654495, + "learning_rate": 7.958656330749354e-06, + "loss": 0.7305, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 2.116766405634247, + "learning_rate": 8.010335917312663e-06, + "loss": 0.6669, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.9162855045739617, + "learning_rate": 8.06201550387597e-06, + "loss": 0.7261, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 2.054963532098712, + "learning_rate": 8.113695090439278e-06, + "loss": 0.7314, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 2.028434092710246, + "learning_rate": 8.165374677002584e-06, + "loss": 0.7294, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 2.048480270719039, + "learning_rate": 8.217054263565893e-06, + "loss": 0.7383, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 2.2176711992410225, + "learning_rate": 8.2687338501292e-06, + "loss": 0.7124, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 2.069919880039789, + "learning_rate": 8.320413436692508e-06, + "loss": 0.7016, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 2.205058554739053, + "learning_rate": 8.372093023255815e-06, + "loss": 0.6351, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 2.1284296744738613, + "learning_rate": 8.423772609819121e-06, + "loss": 0.6765, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 1.9087997643879109, + "learning_rate": 8.47545219638243e-06, + "loss": 0.7027, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 2.1452848850815083, + "learning_rate": 8.527131782945736e-06, + "loss": 0.7407, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.9991766307667835, + "learning_rate": 8.578811369509045e-06, + "loss": 0.6794, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 1.9138380269362316, + "learning_rate": 8.630490956072353e-06, + "loss": 0.6835, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 2.003442543319759, + "learning_rate": 8.68217054263566e-06, + "loss": 0.6293, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.95394526232576, + "learning_rate": 8.733850129198968e-06, + "loss": 0.6773, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 2.088166035828683, + "learning_rate": 8.785529715762275e-06, + "loss": 0.7069, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.9835559025534641, + "learning_rate": 8.837209302325582e-06, + "loss": 0.7081, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.9845261403594612, + "learning_rate": 8.888888888888888e-06, + "loss": 0.6452, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 2.1896583808017813, + "learning_rate": 8.940568475452197e-06, + "loss": 0.6978, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 2.034374240576255, + "learning_rate": 8.992248062015505e-06, + "loss": 0.7193, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 2.254205482228632, + "learning_rate": 9.043927648578812e-06, + "loss": 0.7259, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 1.9883989999974647, + "learning_rate": 9.09560723514212e-06, + "loss": 0.6599, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 2.0406451751506136, + "learning_rate": 9.147286821705427e-06, + "loss": 0.6291, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 2.0446696740745978, + "learning_rate": 9.198966408268735e-06, + "loss": 0.643, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 2.0482308359887362, + "learning_rate": 9.250645994832042e-06, + "loss": 0.7337, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 1.9553371871357939, + "learning_rate": 9.30232558139535e-06, + "loss": 0.6478, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.924814844515946, + "learning_rate": 9.354005167958657e-06, + "loss": 0.6263, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 2.047266915344053, + "learning_rate": 9.405684754521964e-06, + "loss": 0.6979, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 1.842052712981622, + "learning_rate": 9.457364341085272e-06, + "loss": 0.6163, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 2.2294991384925043, + "learning_rate": 9.509043927648579e-06, + "loss": 0.6639, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 2.1086576018925034, + "learning_rate": 9.560723514211887e-06, + "loss": 0.6987, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 2.413299538742816, + "learning_rate": 9.612403100775196e-06, + "loss": 0.6821, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 2.2113895021622576, + "learning_rate": 9.664082687338502e-06, + "loss": 0.6325, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 2.312092100717908, + "learning_rate": 9.71576227390181e-06, + "loss": 0.7312, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 2.0110742814407456, + "learning_rate": 9.767441860465117e-06, + "loss": 0.7209, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 2.003722660213722, + "learning_rate": 9.819121447028424e-06, + "loss": 0.7655, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 2.066364251847548, + "learning_rate": 9.870801033591732e-06, + "loss": 0.6983, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 2.160208896319642, + "learning_rate": 9.922480620155039e-06, + "loss": 0.7163, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 1.969679234720519, + "learning_rate": 9.974160206718347e-06, + "loss": 0.7713, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.7571093021627038, + "learning_rate": 1.0025839793281656e-05, + "loss": 0.6597, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.9672538279531169, + "learning_rate": 1.0077519379844963e-05, + "loss": 0.7103, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 2.1541642868451882, + "learning_rate": 1.012919896640827e-05, + "loss": 0.7753, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.8365651457895535, + "learning_rate": 1.0180878552971578e-05, + "loss": 0.7249, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.9360304766258072, + "learning_rate": 1.0232558139534884e-05, + "loss": 0.6853, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 2.096884473617233, + "learning_rate": 1.0284237726098191e-05, + "loss": 0.7369, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 1.8273847988781062, + "learning_rate": 1.03359173126615e-05, + "loss": 0.7152, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.8844761759607553, + "learning_rate": 1.0387596899224808e-05, + "loss": 0.6734, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 1.914089218264573, + "learning_rate": 1.0439276485788114e-05, + "loss": 0.7143, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 2.116071906845655, + "learning_rate": 1.0490956072351421e-05, + "loss": 0.7001, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 2.0417825964367657, + "learning_rate": 1.0542635658914731e-05, + "loss": 0.7866, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 2.044203084483132, + "learning_rate": 1.0594315245478038e-05, + "loss": 0.6179, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 2.0420830230723035, + "learning_rate": 1.0645994832041345e-05, + "loss": 0.7124, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 2.1051199487865997, + "learning_rate": 1.0697674418604651e-05, + "loss": 0.7409, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.8155533295501942, + "learning_rate": 1.074935400516796e-05, + "loss": 0.6941, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.9123865380632588, + "learning_rate": 1.0801033591731266e-05, + "loss": 0.7245, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.9133835814631164, + "learning_rate": 1.0852713178294573e-05, + "loss": 0.6549, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 1.7395522368006386, + "learning_rate": 1.0904392764857883e-05, + "loss": 0.7039, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 1.8978653033690394, + "learning_rate": 1.095607235142119e-05, + "loss": 0.6929, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 2.0391466846845905, + "learning_rate": 1.1007751937984497e-05, + "loss": 0.7141, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 1.7644765714244997, + "learning_rate": 1.1059431524547803e-05, + "loss": 0.6726, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 1.834831131693005, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.6791, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 1.9981262847543069, + "learning_rate": 1.116279069767442e-05, + "loss": 0.6801, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.8607705913420696, + "learning_rate": 1.1214470284237727e-05, + "loss": 0.6662, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 1.9942224020634212, + "learning_rate": 1.1266149870801035e-05, + "loss": 0.7746, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 1.9566979647943363, + "learning_rate": 1.1317829457364342e-05, + "loss": 0.7071, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 2.0894906289355366, + "learning_rate": 1.1369509043927648e-05, + "loss": 0.7398, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.902596020274279, + "learning_rate": 1.1421188630490959e-05, + "loss": 0.6448, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 1.8516721411325656, + "learning_rate": 1.1472868217054265e-05, + "loss": 0.6533, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.9422069717376376, + "learning_rate": 1.1524547803617572e-05, + "loss": 0.7712, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 2.029117344982057, + "learning_rate": 1.1576227390180879e-05, + "loss": 0.6953, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.9938398502397154, + "learning_rate": 1.1627906976744187e-05, + "loss": 0.6658, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 2.007778419769434, + "learning_rate": 1.1679586563307494e-05, + "loss": 0.6895, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 1.8139338248104935, + "learning_rate": 1.1731266149870802e-05, + "loss": 0.6562, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 1.80716525321881, + "learning_rate": 1.178294573643411e-05, + "loss": 0.7093, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 2.1476574400631026, + "learning_rate": 1.1834625322997417e-05, + "loss": 0.7744, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 2.0661453630430664, + "learning_rate": 1.1886304909560724e-05, + "loss": 0.7099, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 2.1125412525008986, + "learning_rate": 1.193798449612403e-05, + "loss": 0.7794, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 1.8573275922237884, + "learning_rate": 1.198966408268734e-05, + "loss": 0.6658, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 1.8576416784089278, + "learning_rate": 1.2041343669250647e-05, + "loss": 0.6768, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 1.922463427767487, + "learning_rate": 1.2093023255813954e-05, + "loss": 0.7218, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 1.9144802338323745, + "learning_rate": 1.2144702842377262e-05, + "loss": 0.7292, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.8917235028415234, + "learning_rate": 1.2196382428940569e-05, + "loss": 0.6994, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 1.7192218479626784, + "learning_rate": 1.2248062015503876e-05, + "loss": 0.6272, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 2.059429194615707, + "learning_rate": 1.2299741602067184e-05, + "loss": 0.6372, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 1.8141280130390114, + "learning_rate": 1.2351421188630493e-05, + "loss": 0.7174, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 1.8352882698287405, + "learning_rate": 1.24031007751938e-05, + "loss": 0.6856, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.8149697807255358, + "learning_rate": 1.2454780361757106e-05, + "loss": 0.6677, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.960900294950215, + "learning_rate": 1.2506459948320416e-05, + "loss": 0.7238, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 1.898006562980343, + "learning_rate": 1.2558139534883723e-05, + "loss": 0.7101, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 1.8680187593575954, + "learning_rate": 1.260981912144703e-05, + "loss": 0.744, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.9090085316303016, + "learning_rate": 1.2661498708010338e-05, + "loss": 0.6506, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 1.595526658646269, + "learning_rate": 1.2713178294573645e-05, + "loss": 0.6481, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 1.6954086909274888, + "learning_rate": 1.2764857881136951e-05, + "loss": 0.6882, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 2.0163576435791515, + "learning_rate": 1.2816537467700258e-05, + "loss": 0.7297, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.895852106301838, + "learning_rate": 1.2868217054263568e-05, + "loss": 0.7747, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 1.9365275003503144, + "learning_rate": 1.2919896640826875e-05, + "loss": 0.7125, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 1.708399523251377, + "learning_rate": 1.2971576227390181e-05, + "loss": 0.6563, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 1.7895994088720608, + "learning_rate": 1.302325581395349e-05, + "loss": 0.6928, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.8279126687923155, + "learning_rate": 1.3074935400516796e-05, + "loss": 0.7071, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 1.9455123859953396, + "learning_rate": 1.3126614987080105e-05, + "loss": 0.7194, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 1.8876212984229028, + "learning_rate": 1.3178294573643412e-05, + "loss": 0.7188, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 1.7747709663808793, + "learning_rate": 1.322997416020672e-05, + "loss": 0.6819, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 1.821885113172778, + "learning_rate": 1.3281653746770027e-05, + "loss": 0.694, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 1.91953029112429, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.7012, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 1.8682867030853527, + "learning_rate": 1.3385012919896643e-05, + "loss": 0.7418, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 1.7094975512995536, + "learning_rate": 1.343669250645995e-05, + "loss": 0.669, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 1.9776725818158296, + "learning_rate": 1.3488372093023257e-05, + "loss": 0.702, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 1.7692678899762055, + "learning_rate": 1.3540051679586565e-05, + "loss": 0.684, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 1.8705074530572576, + "learning_rate": 1.3591731266149872e-05, + "loss": 0.6216, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 2.034166560357307, + "learning_rate": 1.3643410852713179e-05, + "loss": 0.7239, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 1.9058190781105089, + "learning_rate": 1.3695090439276487e-05, + "loss": 0.7189, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 1.7473392695345875, + "learning_rate": 1.3746770025839795e-05, + "loss": 0.6897, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 1.8400541861475124, + "learning_rate": 1.3798449612403102e-05, + "loss": 0.6924, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.6957248590566456, + "learning_rate": 1.3850129198966409e-05, + "loss": 0.6723, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 1.8766758740788159, + "learning_rate": 1.3901808785529717e-05, + "loss": 0.7572, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 2.1911608988724023, + "learning_rate": 1.3953488372093025e-05, + "loss": 0.8032, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 1.7159125748885473, + "learning_rate": 1.4005167958656332e-05, + "loss": 0.6395, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 1.7821755263454584, + "learning_rate": 1.4056847545219639e-05, + "loss": 0.6888, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 1.8221975888290423, + "learning_rate": 1.4108527131782947e-05, + "loss": 0.727, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 1.7422530499772497, + "learning_rate": 1.4160206718346254e-05, + "loss": 0.6506, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 1.7871411930441092, + "learning_rate": 1.421188630490956e-05, + "loss": 0.731, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 1.7210350106250718, + "learning_rate": 1.426356589147287e-05, + "loss": 0.7012, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 1.7810746491438123, + "learning_rate": 1.4315245478036177e-05, + "loss": 0.6611, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 1.8839496644943539, + "learning_rate": 1.4366925064599484e-05, + "loss": 0.6647, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 1.8778032963553069, + "learning_rate": 1.441860465116279e-05, + "loss": 0.6531, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 1.8287017434168242, + "learning_rate": 1.44702842377261e-05, + "loss": 0.7095, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.8776542949642527, + "learning_rate": 1.4521963824289408e-05, + "loss": 0.7421, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 1.9099247641999344, + "learning_rate": 1.4573643410852714e-05, + "loss": 0.6702, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 1.8216094934275595, + "learning_rate": 1.4625322997416023e-05, + "loss": 0.6371, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 1.8987344266291748, + "learning_rate": 1.467700258397933e-05, + "loss": 0.6714, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.7156933741329128, + "learning_rate": 1.4728682170542636e-05, + "loss": 0.7229, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 1.8087905546016818, + "learning_rate": 1.4780361757105946e-05, + "loss": 0.7129, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 1.9409079574230799, + "learning_rate": 1.4832041343669253e-05, + "loss": 0.7056, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 1.6063497311990749, + "learning_rate": 1.488372093023256e-05, + "loss": 0.6908, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.9730694328654426, + "learning_rate": 1.4935400516795866e-05, + "loss": 0.6481, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 1.5952757469673713, + "learning_rate": 1.4987080103359175e-05, + "loss": 0.6512, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 1.98427725723922, + "learning_rate": 1.5038759689922481e-05, + "loss": 0.7126, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 1.9416110815297036, + "learning_rate": 1.5090439276485788e-05, + "loss": 0.7322, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 1.8932289917607776, + "learning_rate": 1.5142118863049098e-05, + "loss": 0.6591, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 1.7060463724441401, + "learning_rate": 1.5193798449612405e-05, + "loss": 0.6618, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 1.796732158788336, + "learning_rate": 1.5245478036175711e-05, + "loss": 0.6555, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.6835650023375834, + "learning_rate": 1.5297157622739018e-05, + "loss": 0.6566, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 2.1006441626998402, + "learning_rate": 1.5348837209302328e-05, + "loss": 0.7364, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 1.8399146971417395, + "learning_rate": 1.5400516795865635e-05, + "loss": 0.7592, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 1.972446906435267, + "learning_rate": 1.545219638242894e-05, + "loss": 0.7628, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 2.082671161958546, + "learning_rate": 1.550387596899225e-05, + "loss": 0.7714, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.8677547981650926, + "learning_rate": 1.555555555555556e-05, + "loss": 0.754, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.9676800802722405, + "learning_rate": 1.5607235142118865e-05, + "loss": 0.6773, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 1.9592992010467447, + "learning_rate": 1.5658914728682172e-05, + "loss": 0.6701, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 1.7348913163105608, + "learning_rate": 1.571059431524548e-05, + "loss": 0.6879, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.609107874950825, + "learning_rate": 1.5762273901808785e-05, + "loss": 0.6654, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 1.6397025285151703, + "learning_rate": 1.5813953488372095e-05, + "loss": 0.6972, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 1.7851211911688332, + "learning_rate": 1.5865633074935402e-05, + "loss": 0.6876, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 1.9905356346683476, + "learning_rate": 1.591731266149871e-05, + "loss": 0.7228, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 1.8863281098328457, + "learning_rate": 1.5968992248062015e-05, + "loss": 0.7152, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 1.690116408136723, + "learning_rate": 1.6020671834625325e-05, + "loss": 0.7461, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 1.6336463368807541, + "learning_rate": 1.6072351421188632e-05, + "loss": 0.6484, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 1.7284714170503186, + "learning_rate": 1.612403100775194e-05, + "loss": 0.6475, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 2.018403496545159, + "learning_rate": 1.6175710594315245e-05, + "loss": 0.7395, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 1.6333717228147262, + "learning_rate": 1.6227390180878555e-05, + "loss": 0.6498, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 1.6210224917273397, + "learning_rate": 1.6279069767441862e-05, + "loss": 0.657, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 1.9938309416887994, + "learning_rate": 1.633074935400517e-05, + "loss": 0.7661, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 1.8170192545498876, + "learning_rate": 1.638242894056848e-05, + "loss": 0.6766, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 1.8045935668279347, + "learning_rate": 1.6434108527131786e-05, + "loss": 0.6594, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 1.8042382015409661, + "learning_rate": 1.6485788113695092e-05, + "loss": 0.6845, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 1.5535666100814085, + "learning_rate": 1.65374677002584e-05, + "loss": 0.6375, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 1.6997108185464844, + "learning_rate": 1.6589147286821706e-05, + "loss": 0.6859, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 1.7942718225337062, + "learning_rate": 1.6640826873385016e-05, + "loss": 0.7387, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 1.6747243255820246, + "learning_rate": 1.6692506459948323e-05, + "loss": 0.7287, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.6454662283784893, + "learning_rate": 1.674418604651163e-05, + "loss": 0.6235, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.9309376362153705, + "learning_rate": 1.6795865633074936e-05, + "loss": 0.7316, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 1.7199744977834737, + "learning_rate": 1.6847545219638243e-05, + "loss": 0.6656, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 1.860879882125513, + "learning_rate": 1.689922480620155e-05, + "loss": 0.7451, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 1.6897752046902168, + "learning_rate": 1.695090439276486e-05, + "loss": 0.6836, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.8385185117767382, + "learning_rate": 1.7002583979328166e-05, + "loss": 0.7228, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 1.7139177537723818, + "learning_rate": 1.7054263565891473e-05, + "loss": 0.6212, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.9065972308984798, + "learning_rate": 1.7105943152454783e-05, + "loss": 0.736, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 1.597571037277131, + "learning_rate": 1.715762273901809e-05, + "loss": 0.7313, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.5904093867911444, + "learning_rate": 1.7209302325581396e-05, + "loss": 0.6848, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 1.7389702620812566, + "learning_rate": 1.7260981912144706e-05, + "loss": 0.7166, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 1.7843425073020225, + "learning_rate": 1.7312661498708013e-05, + "loss": 0.7396, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.6234645925689897, + "learning_rate": 1.736434108527132e-05, + "loss": 0.6724, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.7248537747976185, + "learning_rate": 1.7416020671834626e-05, + "loss": 0.6683, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 1.7138102899654486, + "learning_rate": 1.7467700258397936e-05, + "loss": 0.6655, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 1.9091086293921518, + "learning_rate": 1.7519379844961243e-05, + "loss": 0.6957, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 1.6963941929137416, + "learning_rate": 1.757105943152455e-05, + "loss": 0.6796, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.7328281549790534, + "learning_rate": 1.7622739018087857e-05, + "loss": 0.7055, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 1.589815631589682, + "learning_rate": 1.7674418604651163e-05, + "loss": 0.7067, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 1.736638216321355, + "learning_rate": 1.772609819121447e-05, + "loss": 0.7284, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 1.6392960751987011, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.6809, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 1.5876232640027388, + "learning_rate": 1.7829457364341087e-05, + "loss": 0.7757, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 1.5829662432851723, + "learning_rate": 1.7881136950904393e-05, + "loss": 0.7285, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 1.5552442311025254, + "learning_rate": 1.79328165374677e-05, + "loss": 0.6617, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 1.888963836541859, + "learning_rate": 1.798449612403101e-05, + "loss": 0.6744, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.7183686266782514, + "learning_rate": 1.8036175710594317e-05, + "loss": 0.7592, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 1.754612294162425, + "learning_rate": 1.8087855297157624e-05, + "loss": 0.7025, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 1.735550490673282, + "learning_rate": 1.813953488372093e-05, + "loss": 0.7654, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 1.8993842507501426, + "learning_rate": 1.819121447028424e-05, + "loss": 0.7315, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.7647160578883143, + "learning_rate": 1.8242894056847547e-05, + "loss": 0.7498, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 1.5622315748437299, + "learning_rate": 1.8294573643410854e-05, + "loss": 0.6246, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 1.6830378984478318, + "learning_rate": 1.8346253229974164e-05, + "loss": 0.6808, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 1.6788587788029756, + "learning_rate": 1.839793281653747e-05, + "loss": 0.6637, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 1.6853859693123143, + "learning_rate": 1.8449612403100777e-05, + "loss": 0.7021, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 1.7256901120754673, + "learning_rate": 1.8501291989664084e-05, + "loss": 0.6518, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 1.693381902243467, + "learning_rate": 1.855297157622739e-05, + "loss": 0.7455, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 1.743409280064828, + "learning_rate": 1.86046511627907e-05, + "loss": 0.7063, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.8010066423013003, + "learning_rate": 1.8656330749354007e-05, + "loss": 0.7634, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 1.7266054450170285, + "learning_rate": 1.8708010335917314e-05, + "loss": 0.6595, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 1.7604733821001817, + "learning_rate": 1.875968992248062e-05, + "loss": 0.6977, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 1.5710357215214774, + "learning_rate": 1.8811369509043927e-05, + "loss": 0.7089, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 1.7716215810245508, + "learning_rate": 1.8863049095607237e-05, + "loss": 0.6371, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 1.5191226988578574, + "learning_rate": 1.8914728682170544e-05, + "loss": 0.6987, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 1.6351020494163706, + "learning_rate": 1.896640826873385e-05, + "loss": 0.7209, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 1.427810270324823, + "learning_rate": 1.9018087855297158e-05, + "loss": 0.672, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 1.5550164876229366, + "learning_rate": 1.9069767441860468e-05, + "loss": 0.7001, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 1.6459538701338634, + "learning_rate": 1.9121447028423774e-05, + "loss": 0.8093, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 1.6916808859066244, + "learning_rate": 1.917312661498708e-05, + "loss": 0.8031, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 1.792743477559064, + "learning_rate": 1.922480620155039e-05, + "loss": 0.7294, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 1.6043612704787404, + "learning_rate": 1.9276485788113698e-05, + "loss": 0.6933, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 1.7592731426018393, + "learning_rate": 1.9328165374677004e-05, + "loss": 0.7709, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 1.6771718193388752, + "learning_rate": 1.937984496124031e-05, + "loss": 0.6986, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 1.6994093514015336, + "learning_rate": 1.943152454780362e-05, + "loss": 0.7417, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 1.7915316538248756, + "learning_rate": 1.9483204134366928e-05, + "loss": 0.7313, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 2.0338506629382898, + "learning_rate": 1.9534883720930235e-05, + "loss": 0.7696, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 1.960364695017877, + "learning_rate": 1.958656330749354e-05, + "loss": 0.7206, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 1.8772124271125357, + "learning_rate": 1.9638242894056848e-05, + "loss": 0.7129, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 1.9636413049756116, + "learning_rate": 1.9689922480620155e-05, + "loss": 0.7082, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 1.6527663247417765, + "learning_rate": 1.9741602067183465e-05, + "loss": 0.6949, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 1.6633158696747927, + "learning_rate": 1.979328165374677e-05, + "loss": 0.6722, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 1.721959810193357, + "learning_rate": 1.9844961240310078e-05, + "loss": 0.7567, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.7364744247060135, + "learning_rate": 1.9896640826873385e-05, + "loss": 0.7176, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 1.7954586915735413, + "learning_rate": 1.9948320413436695e-05, + "loss": 0.7535, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 1.7346229719620336, + "learning_rate": 2e-05, + "loss": 0.7229, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 1.7528736498703117, + "learning_rate": 1.9999999684324205e-05, + "loss": 0.7176, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 1.5477324912304453, + "learning_rate": 1.9999998737296837e-05, + "loss": 0.6282, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 1.7132943002853476, + "learning_rate": 1.9999997158917953e-05, + "loss": 0.7502, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 1.6280519729022618, + "learning_rate": 1.9999994949187657e-05, + "loss": 0.6189, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 1.5954083061779611, + "learning_rate": 1.9999992108106083e-05, + "loss": 0.7113, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 1.573154667243791, + "learning_rate": 1.9999988635673414e-05, + "loss": 0.6588, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 1.7079942025481558, + "learning_rate": 1.9999984531889875e-05, + "loss": 0.6982, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 1.6879653995321409, + "learning_rate": 1.9999979796755715e-05, + "loss": 0.666, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 1.688187141390456, + "learning_rate": 1.999997443027124e-05, + "loss": 0.6362, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 1.610216531823565, + "learning_rate": 1.9999968432436785e-05, + "loss": 0.6575, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 2.3236145772969543, + "learning_rate": 1.9999961803252726e-05, + "loss": 0.7453, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 2.6347959388024034, + "learning_rate": 1.999995454271949e-05, + "loss": 0.7709, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 1.6901873630291864, + "learning_rate": 1.999994665083753e-05, + "loss": 0.7524, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 1.59903856836198, + "learning_rate": 1.9999938127607342e-05, + "loss": 0.6619, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 1.5768222530583047, + "learning_rate": 1.9999928973029472e-05, + "loss": 0.6928, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 1.6138677754701747, + "learning_rate": 1.999991918710449e-05, + "loss": 0.7036, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 1.557050816039296, + "learning_rate": 1.999990876983302e-05, + "loss": 0.6806, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 1.8214136157770862, + "learning_rate": 1.999989772121571e-05, + "loss": 0.7474, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 1.776652912399176, + "learning_rate": 1.999988604125327e-05, + "loss": 0.7904, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 1.6106579305815039, + "learning_rate": 1.9999873729946432e-05, + "loss": 0.7247, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 1.614877571449385, + "learning_rate": 1.999986078729597e-05, + "loss": 0.6763, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 1.5951139560039433, + "learning_rate": 1.9999847213302703e-05, + "loss": 0.6638, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 2.9395387757454956, + "learning_rate": 1.999983300796749e-05, + "loss": 0.6751, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 1.664135711921985, + "learning_rate": 1.999981817129123e-05, + "loss": 0.7085, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 1.6385503821637333, + "learning_rate": 1.9999802703274854e-05, + "loss": 0.7145, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 1.801231013667197, + "learning_rate": 1.9999786603919343e-05, + "loss": 0.7448, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 1.7969883758602014, + "learning_rate": 1.9999769873225706e-05, + "loss": 0.7637, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 1.736104635438498, + "learning_rate": 1.999975251119501e-05, + "loss": 0.7366, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 1.5335612079356686, + "learning_rate": 1.9999734517828345e-05, + "loss": 0.6593, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 1.6600035114710792, + "learning_rate": 1.999971589312685e-05, + "loss": 0.7751, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 1.7123815982846584, + "learning_rate": 1.9999696637091698e-05, + "loss": 0.7392, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 1.630631665076908, + "learning_rate": 1.9999676749724103e-05, + "loss": 0.7332, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 1.6337467422978231, + "learning_rate": 1.9999656231025323e-05, + "loss": 0.6824, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 1.5854489931811897, + "learning_rate": 1.9999635080996655e-05, + "loss": 0.698, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 1.674222207610974, + "learning_rate": 1.9999613299639433e-05, + "loss": 0.7406, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 1.6331969178252201, + "learning_rate": 1.9999590886955033e-05, + "loss": 0.7627, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 1.7665659329947683, + "learning_rate": 1.999956784294487e-05, + "loss": 0.7273, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 1.642254846355217, + "learning_rate": 1.9999544167610396e-05, + "loss": 0.6495, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 1.6550105513498534, + "learning_rate": 1.999951986095311e-05, + "loss": 0.7321, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 1.6336809249086557, + "learning_rate": 1.9999494922974544e-05, + "loss": 0.7034, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 1.5903316564562422, + "learning_rate": 1.9999469353676272e-05, + "loss": 0.7146, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 1.652225354870139, + "learning_rate": 1.999944315305991e-05, + "loss": 0.7124, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 1.6051610944242776, + "learning_rate": 1.999941632112711e-05, + "loss": 0.7083, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 1.6624895281927967, + "learning_rate": 1.9999388857879568e-05, + "loss": 0.6793, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 1.5903397519817577, + "learning_rate": 1.9999360763319015e-05, + "loss": 0.7418, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 1.6496411308973509, + "learning_rate": 1.999933203744723e-05, + "loss": 0.699, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 1.450748921056225, + "learning_rate": 1.9999302680266023e-05, + "loss": 0.6681, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 1.4417140394303158, + "learning_rate": 1.9999272691777246e-05, + "loss": 0.6271, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 1.6257541080465774, + "learning_rate": 1.99992420719828e-05, + "loss": 0.6785, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 1.4910585616627743, + "learning_rate": 1.999921082088461e-05, + "loss": 0.7389, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 1.6186063135659114, + "learning_rate": 1.999917893848465e-05, + "loss": 0.6467, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 1.6937318104122674, + "learning_rate": 1.9999146424784938e-05, + "loss": 0.7253, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 1.5915691938396197, + "learning_rate": 1.9999113279787517e-05, + "loss": 0.7449, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 1.7866388420647545, + "learning_rate": 1.9999079503494496e-05, + "loss": 0.7856, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 1.6184684361257564, + "learning_rate": 1.9999045095907988e-05, + "loss": 0.7174, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 1.3692111923599486, + "learning_rate": 1.9999010057030183e-05, + "loss": 0.661, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 1.647713814205106, + "learning_rate": 1.9998974386863276e-05, + "loss": 0.6867, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 1.9018199285631465, + "learning_rate": 1.9998938085409534e-05, + "loss": 0.731, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 1.562980883031733, + "learning_rate": 1.9998901152671243e-05, + "loss": 0.6987, + "step": 446 + }, + { + "epoch": 0.03, + "grad_norm": 1.6468384566767702, + "learning_rate": 1.9998863588650732e-05, + "loss": 0.7059, + "step": 447 + }, + { + "epoch": 0.03, + "grad_norm": 1.7768572801799298, + "learning_rate": 1.9998825393350375e-05, + "loss": 0.7415, + "step": 448 + }, + { + "epoch": 0.03, + "grad_norm": 1.6625920635919154, + "learning_rate": 1.999878656677259e-05, + "loss": 0.7469, + "step": 449 + }, + { + "epoch": 0.03, + "grad_norm": 1.6094740309384488, + "learning_rate": 1.9998747108919815e-05, + "loss": 0.6855, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 1.991797913997878, + "learning_rate": 1.999870701979455e-05, + "loss": 0.7273, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 1.7667346269503605, + "learning_rate": 1.9998666299399326e-05, + "loss": 0.7284, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 1.766153239672727, + "learning_rate": 1.999862494773671e-05, + "loss": 0.7407, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 1.656801599742351, + "learning_rate": 1.9998582964809317e-05, + "loss": 0.7124, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 1.7539647693684355, + "learning_rate": 1.9998540350619793e-05, + "loss": 0.7933, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 1.668483824343145, + "learning_rate": 1.9998497105170833e-05, + "loss": 0.6857, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 1.7266829090508202, + "learning_rate": 1.9998453228465165e-05, + "loss": 0.7264, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 1.504589213075702, + "learning_rate": 1.999840872050556e-05, + "loss": 0.6976, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 1.5411625201048247, + "learning_rate": 1.999836358129483e-05, + "loss": 0.6803, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 1.6495119182136446, + "learning_rate": 1.9998317810835815e-05, + "loss": 0.6846, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.5180157554522513, + "learning_rate": 1.9998271409131417e-05, + "loss": 0.7037, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 1.7127400835580084, + "learning_rate": 1.999822437618456e-05, + "loss": 0.7223, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 1.608780167659147, + "learning_rate": 1.9998176711998215e-05, + "loss": 0.7638, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 1.5538310083092766, + "learning_rate": 1.999812841657539e-05, + "loss": 0.7472, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 1.6357233849199189, + "learning_rate": 1.9998079489919134e-05, + "loss": 0.7257, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 1.5671861186454064, + "learning_rate": 1.9998029932032534e-05, + "loss": 0.6688, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 1.5890134850728637, + "learning_rate": 1.9997979742918723e-05, + "loss": 0.7639, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 1.5196065614370386, + "learning_rate": 1.999792892258087e-05, + "loss": 0.6138, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 1.5383873247623239, + "learning_rate": 1.9997877471022182e-05, + "loss": 0.6591, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 1.645732015396448, + "learning_rate": 1.9997825388245905e-05, + "loss": 0.7215, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 1.524508212479353, + "learning_rate": 1.999777267425533e-05, + "loss": 0.6901, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 1.481353656344975, + "learning_rate": 1.9997719329053782e-05, + "loss": 0.7021, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 1.6202897985297207, + "learning_rate": 1.999766535264463e-05, + "loss": 0.7857, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 1.5237698705940714, + "learning_rate": 1.9997610745031292e-05, + "loss": 0.6699, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 1.6958931486685858, + "learning_rate": 1.99975555062172e-05, + "loss": 0.7847, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 1.8245844668413667, + "learning_rate": 1.9997499636205847e-05, + "loss": 0.7416, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 1.7432899579231305, + "learning_rate": 1.9997443135000765e-05, + "loss": 0.7082, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 1.6527655313431695, + "learning_rate": 1.9997386002605515e-05, + "loss": 0.7221, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 1.526372576391039, + "learning_rate": 1.999732823902371e-05, + "loss": 0.7254, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 1.50463548295059, + "learning_rate": 1.9997269844258993e-05, + "loss": 0.6663, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 1.5833940828949773, + "learning_rate": 1.999721081831505e-05, + "loss": 0.6769, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 1.5093683309782873, + "learning_rate": 1.9997151161195613e-05, + "loss": 0.6292, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 1.5451827377593428, + "learning_rate": 1.9997090872904442e-05, + "loss": 0.7566, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 1.652419356832569, + "learning_rate": 1.9997029953445345e-05, + "loss": 0.6829, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 1.6704165791129075, + "learning_rate": 1.999696840282217e-05, + "loss": 0.678, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 1.6164923754204596, + "learning_rate": 1.9996906221038802e-05, + "loss": 0.7532, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 1.5693936672642468, + "learning_rate": 1.999684340809917e-05, + "loss": 0.6923, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 1.678340069789809, + "learning_rate": 1.9996779964007232e-05, + "loss": 0.6951, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 1.4737192056559285, + "learning_rate": 1.9996715888767e-05, + "loss": 0.6884, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 1.4676956288112966, + "learning_rate": 1.9996651182382518e-05, + "loss": 0.7117, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 1.4944319696547814, + "learning_rate": 1.999658584485787e-05, + "loss": 0.6733, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 1.5298826379579882, + "learning_rate": 1.9996519876197185e-05, + "loss": 0.7322, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 1.5099953776408808, + "learning_rate": 1.999645327640462e-05, + "loss": 0.7466, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 1.4888009192841576, + "learning_rate": 1.999638604548439e-05, + "loss": 0.725, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 1.4929981526224776, + "learning_rate": 1.9996318183440732e-05, + "loss": 0.7063, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 1.598305493345369, + "learning_rate": 1.9996249690277934e-05, + "loss": 0.7423, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 1.4226248304572933, + "learning_rate": 1.999618056600032e-05, + "loss": 0.6704, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 1.7489322402155776, + "learning_rate": 1.999611081061225e-05, + "loss": 0.725, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 1.5527406704373163, + "learning_rate": 1.999604042411813e-05, + "loss": 0.6761, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 1.769345305317264, + "learning_rate": 1.9995969406522412e-05, + "loss": 0.7771, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.6652858338714835, + "learning_rate": 1.9995897757829564e-05, + "loss": 0.7055, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 1.663801217498132, + "learning_rate": 1.9995825478044126e-05, + "loss": 0.7385, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 1.7078167048871067, + "learning_rate": 1.999575256717065e-05, + "loss": 0.7001, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 1.5668344269206198, + "learning_rate": 1.9995679025213747e-05, + "loss": 0.6665, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 1.6364168239147343, + "learning_rate": 1.9995604852178055e-05, + "loss": 0.7096, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 1.5765966436282393, + "learning_rate": 1.9995530048068253e-05, + "loss": 0.7309, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 1.5192936807178008, + "learning_rate": 1.9995454612889076e-05, + "loss": 0.6978, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 1.6293084908077977, + "learning_rate": 1.9995378546645274e-05, + "loss": 0.6854, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 1.667735456460159, + "learning_rate": 1.999530184934166e-05, + "loss": 0.7553, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 1.5729917377805436, + "learning_rate": 1.9995224520983068e-05, + "loss": 0.6877, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 1.564234495661959, + "learning_rate": 1.9995146561574384e-05, + "loss": 0.6676, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 1.5438824616101234, + "learning_rate": 1.9995067971120527e-05, + "loss": 0.6212, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.8259770743360872, + "learning_rate": 1.9994988749626464e-05, + "loss": 0.702, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 1.5362335385464132, + "learning_rate": 1.9994908897097195e-05, + "loss": 0.7317, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 1.6953742372021292, + "learning_rate": 1.999482841353776e-05, + "loss": 0.7131, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 1.6076065272604732, + "learning_rate": 1.999474729895324e-05, + "loss": 0.7146, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 1.477019869898655, + "learning_rate": 1.9994665553348757e-05, + "loss": 0.6224, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 1.5236802910247287, + "learning_rate": 1.999458317672947e-05, + "loss": 0.633, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 1.4541173028446257, + "learning_rate": 1.9994500169100583e-05, + "loss": 0.7029, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 1.4766634951974353, + "learning_rate": 1.9994416530467336e-05, + "loss": 0.6331, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 1.688122387304618, + "learning_rate": 1.9994332260835007e-05, + "loss": 0.7227, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 1.4646164374410906, + "learning_rate": 1.9994247360208924e-05, + "loss": 0.675, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 1.5011274551065308, + "learning_rate": 1.9994161828594435e-05, + "loss": 0.6439, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 1.6908090431319895, + "learning_rate": 1.9994075665996952e-05, + "loss": 0.6839, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 1.5981954023945975, + "learning_rate": 1.9993988872421902e-05, + "loss": 0.7136, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 1.5478585709374062, + "learning_rate": 1.999390144787478e-05, + "loss": 0.7035, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 1.4579428104492471, + "learning_rate": 1.9993813392361095e-05, + "loss": 0.6551, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 1.5187776947636051, + "learning_rate": 1.999372470588641e-05, + "loss": 0.6541, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 1.4904409842799748, + "learning_rate": 1.999363538845632e-05, + "loss": 0.6434, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 1.619864565403804, + "learning_rate": 1.9993545440076473e-05, + "loss": 0.7437, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 1.6021414105917995, + "learning_rate": 1.9993454860752538e-05, + "loss": 0.6774, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 1.5584038257804533, + "learning_rate": 1.999336365049024e-05, + "loss": 0.6537, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 1.5625273892863119, + "learning_rate": 1.9993271809295337e-05, + "loss": 0.7039, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 1.4000568497559387, + "learning_rate": 1.9993179337173624e-05, + "loss": 0.6784, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 1.828352824303991, + "learning_rate": 1.9993086234130944e-05, + "loss": 0.732, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 1.6990779423744238, + "learning_rate": 1.999299250017317e-05, + "loss": 0.6641, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 1.604924316166706, + "learning_rate": 1.9992898135306223e-05, + "loss": 0.7196, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 1.6965750638194799, + "learning_rate": 1.999280313953606e-05, + "loss": 0.6855, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 1.5202492241075667, + "learning_rate": 1.999270751286868e-05, + "loss": 0.6212, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 1.6066269606647534, + "learning_rate": 1.9992611255310115e-05, + "loss": 0.7084, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 1.6071333597296182, + "learning_rate": 1.9992514366866453e-05, + "loss": 0.6485, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 1.5036425867189884, + "learning_rate": 1.9992416847543802e-05, + "loss": 0.7262, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 1.5939623373361334, + "learning_rate": 1.9992318697348318e-05, + "loss": 0.6811, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 1.8675489514738624, + "learning_rate": 1.9992219916286205e-05, + "loss": 0.7211, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 1.497878083787526, + "learning_rate": 1.9992120504363694e-05, + "loss": 0.6811, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 1.4579373321565814, + "learning_rate": 1.9992020461587063e-05, + "loss": 0.6915, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 1.591533990226762, + "learning_rate": 1.9991919787962627e-05, + "loss": 0.6991, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 1.5554914839734313, + "learning_rate": 1.9991818483496747e-05, + "loss": 0.6941, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 1.587951508841042, + "learning_rate": 1.999171654819581e-05, + "loss": 0.667, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 1.4680807333841988, + "learning_rate": 1.999161398206626e-05, + "loss": 0.6474, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 1.5560024968546469, + "learning_rate": 1.999151078511457e-05, + "loss": 0.6836, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 1.6578572949425674, + "learning_rate": 1.999140695734725e-05, + "loss": 0.6271, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 1.5658728245412266, + "learning_rate": 1.9991302498770867e-05, + "loss": 0.7176, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 1.5784429097767325, + "learning_rate": 1.9991197409392004e-05, + "loss": 0.6789, + "step": 554 + }, + { + "epoch": 0.04, + "grad_norm": 1.6372971707589818, + "learning_rate": 1.9991091689217303e-05, + "loss": 0.703, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 1.421081573115393, + "learning_rate": 1.9990985338253434e-05, + "loss": 0.7214, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 1.5860319062495565, + "learning_rate": 1.9990878356507116e-05, + "loss": 0.6939, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 1.6161604121398174, + "learning_rate": 1.99907707439851e-05, + "loss": 0.7171, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 1.587284662254865, + "learning_rate": 1.9990662500694183e-05, + "loss": 0.7268, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 1.5060141319381881, + "learning_rate": 1.9990553626641194e-05, + "loss": 0.698, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 1.4562455173145112, + "learning_rate": 1.9990444121833016e-05, + "loss": 0.658, + "step": 561 + }, + { + "epoch": 0.04, + "grad_norm": 1.4167032611084793, + "learning_rate": 1.9990333986276552e-05, + "loss": 0.6858, + "step": 562 + }, + { + "epoch": 0.04, + "grad_norm": 1.5886033676755311, + "learning_rate": 1.999022321997876e-05, + "loss": 0.6974, + "step": 563 + }, + { + "epoch": 0.04, + "grad_norm": 1.5288863331350755, + "learning_rate": 1.9990111822946634e-05, + "loss": 0.6808, + "step": 564 + }, + { + "epoch": 0.04, + "grad_norm": 1.2469602818591705, + "learning_rate": 1.9989999795187206e-05, + "loss": 0.6634, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 1.4832806217100252, + "learning_rate": 1.998988713670755e-05, + "loss": 0.6709, + "step": 566 + }, + { + "epoch": 0.04, + "grad_norm": 1.5277468254344768, + "learning_rate": 1.998977384751478e-05, + "loss": 0.7142, + "step": 567 + }, + { + "epoch": 0.04, + "grad_norm": 1.5811186996046123, + "learning_rate": 1.9989659927616044e-05, + "loss": 0.7135, + "step": 568 + }, + { + "epoch": 0.04, + "grad_norm": 1.6281699293384213, + "learning_rate": 1.9989545377018538e-05, + "loss": 0.6752, + "step": 569 + }, + { + "epoch": 0.04, + "grad_norm": 1.5158983750917991, + "learning_rate": 1.9989430195729494e-05, + "loss": 0.6805, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 1.6764062814021865, + "learning_rate": 1.998931438375618e-05, + "loss": 0.7161, + "step": 571 + }, + { + "epoch": 0.04, + "grad_norm": 1.4806756753167973, + "learning_rate": 1.998919794110591e-05, + "loss": 0.7263, + "step": 572 + }, + { + "epoch": 0.04, + "grad_norm": 1.4557835859341586, + "learning_rate": 1.998908086778604e-05, + "loss": 0.7002, + "step": 573 + }, + { + "epoch": 0.04, + "grad_norm": 1.5920415959315477, + "learning_rate": 1.9988963163803958e-05, + "loss": 0.7437, + "step": 574 + }, + { + "epoch": 0.04, + "grad_norm": 1.5396736596865324, + "learning_rate": 1.9988844829167092e-05, + "loss": 0.6971, + "step": 575 + }, + { + "epoch": 0.04, + "grad_norm": 1.6295908854475456, + "learning_rate": 1.9988725863882922e-05, + "loss": 0.7321, + "step": 576 + }, + { + "epoch": 0.04, + "grad_norm": 1.4903184459565266, + "learning_rate": 1.998860626795895e-05, + "loss": 0.69, + "step": 577 + }, + { + "epoch": 0.04, + "grad_norm": 1.5427135401330314, + "learning_rate": 1.998848604140273e-05, + "loss": 0.6935, + "step": 578 + }, + { + "epoch": 0.04, + "grad_norm": 1.4066609100175793, + "learning_rate": 1.998836518422185e-05, + "loss": 0.6507, + "step": 579 + }, + { + "epoch": 0.04, + "grad_norm": 1.5534434492894746, + "learning_rate": 1.9988243696423947e-05, + "loss": 0.6572, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 1.5255378525100314, + "learning_rate": 1.9988121578016683e-05, + "loss": 0.693, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 1.7531413086049314, + "learning_rate": 1.9987998829007775e-05, + "loss": 0.7463, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 1.501387987108143, + "learning_rate": 1.9987875449404965e-05, + "loss": 0.6806, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 1.5181521553761232, + "learning_rate": 1.998775143921605e-05, + "loss": 0.69, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 1.7491486385822463, + "learning_rate": 1.9987626798448858e-05, + "loss": 0.6683, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 1.5476621686426533, + "learning_rate": 1.9987501527111253e-05, + "loss": 0.6665, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 1.5106475741270504, + "learning_rate": 1.9987375625211155e-05, + "loss": 0.6797, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 1.5416137926811995, + "learning_rate": 1.99872490927565e-05, + "loss": 0.6635, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 1.5484727256917714, + "learning_rate": 1.9987121929755284e-05, + "loss": 0.6813, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 1.4053969444944152, + "learning_rate": 1.9986994136215533e-05, + "loss": 0.6086, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 1.4333779487023979, + "learning_rate": 1.9986865712145316e-05, + "loss": 0.6764, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 1.4560090025214762, + "learning_rate": 1.9986736657552742e-05, + "loss": 0.6517, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 2.267194743443912, + "learning_rate": 1.9986606972445956e-05, + "loss": 0.7425, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 1.579606748605446, + "learning_rate": 1.998647665683315e-05, + "loss": 0.7427, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 1.5422188397192407, + "learning_rate": 1.998634571072255e-05, + "loss": 0.701, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 1.6001968143530119, + "learning_rate": 1.998621413412242e-05, + "loss": 0.6989, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 1.5119614517202604, + "learning_rate": 1.998608192704107e-05, + "loss": 0.6976, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 1.607026098945014, + "learning_rate": 1.9985949089486847e-05, + "loss": 0.6309, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 1.4701575981050437, + "learning_rate": 1.998581562146814e-05, + "loss": 0.7231, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 1.6280420878966146, + "learning_rate": 1.998568152299337e-05, + "loss": 0.7428, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.3699178843009088, + "learning_rate": 1.9985546794071006e-05, + "loss": 0.6128, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 1.5123627151102557, + "learning_rate": 1.9985411434709553e-05, + "loss": 0.6496, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 1.4369175601743995, + "learning_rate": 1.998527544491756e-05, + "loss": 0.6871, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 1.5328196051974403, + "learning_rate": 1.998513882470361e-05, + "loss": 0.6637, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 1.4953447264703434, + "learning_rate": 1.998500157407633e-05, + "loss": 0.6365, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 1.4521188841982653, + "learning_rate": 1.9984863693044385e-05, + "loss": 0.6142, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 1.5928050212897833, + "learning_rate": 1.998472518161648e-05, + "loss": 0.7354, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 1.5919485944279186, + "learning_rate": 1.998458603980136e-05, + "loss": 0.7091, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 1.3083587891031876, + "learning_rate": 1.998444626760781e-05, + "loss": 0.6477, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 1.3466313984866822, + "learning_rate": 1.9984305865044654e-05, + "loss": 0.6531, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 1.5911592107315577, + "learning_rate": 1.9984164832120755e-05, + "loss": 0.7384, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 1.6353324900870023, + "learning_rate": 1.998402316884502e-05, + "loss": 0.7327, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 1.653308631779101, + "learning_rate": 1.998388087522639e-05, + "loss": 0.671, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 1.4320412526715238, + "learning_rate": 1.9983737951273854e-05, + "loss": 0.6804, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 1.5341690422480376, + "learning_rate": 1.9983594396996428e-05, + "loss": 0.705, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 1.6323730023277883, + "learning_rate": 1.998345021240318e-05, + "loss": 0.6864, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.5669000089982936, + "learning_rate": 1.9983305397503214e-05, + "loss": 0.636, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 1.501495490032769, + "learning_rate": 1.9983159952305668e-05, + "loss": 0.6606, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 1.5710918712936695, + "learning_rate": 1.998301387681973e-05, + "loss": 0.64, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 1.4704143752378123, + "learning_rate": 1.9982867171054622e-05, + "loss": 0.6388, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 1.5505054142084593, + "learning_rate": 1.9982719835019604e-05, + "loss": 0.7019, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 1.5782432889513178, + "learning_rate": 1.9982571868723975e-05, + "loss": 0.7124, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 1.5165856846299397, + "learning_rate": 1.9982423272177087e-05, + "loss": 0.6638, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 1.621152872133489, + "learning_rate": 1.998227404538831e-05, + "loss": 0.7407, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.5111365170462379, + "learning_rate": 1.998212418836707e-05, + "loss": 0.7225, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 1.5483996651316276, + "learning_rate": 1.998197370112283e-05, + "loss": 0.7263, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 1.399157163115521, + "learning_rate": 1.9981822583665094e-05, + "loss": 0.6447, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 1.4471568943214923, + "learning_rate": 1.9981670836003396e-05, + "loss": 0.6718, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 1.4901968575050888, + "learning_rate": 1.998151845814732e-05, + "loss": 0.6279, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 1.5009167571801751, + "learning_rate": 1.9981365450106484e-05, + "loss": 0.6916, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 1.597806816187955, + "learning_rate": 1.9981211811890554e-05, + "loss": 0.6974, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 1.576796548519489, + "learning_rate": 1.998105754350922e-05, + "loss": 0.7309, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 1.5228990434634961, + "learning_rate": 1.9980902644972234e-05, + "loss": 0.7332, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 1.5724283264516041, + "learning_rate": 1.998074711628937e-05, + "loss": 0.6653, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 1.5764066197560027, + "learning_rate": 1.9980590957470437e-05, + "loss": 0.6611, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 1.624152035648111, + "learning_rate": 1.9980434168525315e-05, + "loss": 0.6929, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 1.4293267467178583, + "learning_rate": 1.9980276749463886e-05, + "loss": 0.6679, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 1.5297459592159572, + "learning_rate": 1.9980118700296095e-05, + "loss": 0.647, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 1.5407633411746564, + "learning_rate": 1.997996002103192e-05, + "loss": 0.6684, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 1.4553574671565237, + "learning_rate": 1.997980071168138e-05, + "loss": 0.6923, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 1.5706140788465575, + "learning_rate": 1.9979640772254528e-05, + "loss": 0.7113, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 1.500585441782913, + "learning_rate": 1.997948020276147e-05, + "loss": 0.6686, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 1.6194846382554953, + "learning_rate": 1.9979319003212337e-05, + "loss": 0.696, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 1.6542761214754882, + "learning_rate": 1.997915717361731e-05, + "loss": 0.7594, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 1.5234030695227856, + "learning_rate": 1.9978994713986606e-05, + "loss": 0.7039, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 1.5943618048911075, + "learning_rate": 1.9978831624330483e-05, + "loss": 0.6635, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 1.4703588399364698, + "learning_rate": 1.997866790465923e-05, + "loss": 0.7, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 1.484639435353238, + "learning_rate": 1.997850355498319e-05, + "loss": 0.6823, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 1.4989152800964898, + "learning_rate": 1.9978338575312742e-05, + "loss": 0.7432, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 1.4212172108057681, + "learning_rate": 1.9978172965658297e-05, + "loss": 0.6346, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 1.5397759347630786, + "learning_rate": 1.997800672603031e-05, + "loss": 0.7287, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 1.471766681289549, + "learning_rate": 1.997783985643928e-05, + "loss": 0.6427, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 1.406277168859184, + "learning_rate": 1.997767235689574e-05, + "loss": 0.6361, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 1.540609366625842, + "learning_rate": 1.9977504227410268e-05, + "loss": 0.7265, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 1.4956908318409674, + "learning_rate": 1.9977335467993474e-05, + "loss": 0.7131, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 1.7909151171788398, + "learning_rate": 1.997716607865602e-05, + "loss": 0.6775, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 1.7897552082035544, + "learning_rate": 1.9976996059408595e-05, + "loss": 0.7351, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 1.4937434048686618, + "learning_rate": 1.997682541026193e-05, + "loss": 0.657, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 1.5901092382609006, + "learning_rate": 1.997665413122681e-05, + "loss": 0.7749, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 1.4276446980963735, + "learning_rate": 1.9976482222314034e-05, + "loss": 0.7159, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 1.5174070283919316, + "learning_rate": 1.997630968353447e-05, + "loss": 0.6609, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 1.586215741784857, + "learning_rate": 1.9976136514899e-05, + "loss": 0.6905, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 1.5148969651741977, + "learning_rate": 1.9975962716418565e-05, + "loss": 0.7307, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 1.4337425050813954, + "learning_rate": 1.9975788288104132e-05, + "loss": 0.712, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 1.3863645719164037, + "learning_rate": 1.997561322996672e-05, + "loss": 0.643, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 1.6304058974927758, + "learning_rate": 1.9975437542017372e-05, + "loss": 0.7391, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 1.5533666319136317, + "learning_rate": 1.9975261224267187e-05, + "loss": 0.6778, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 1.6759539606330542, + "learning_rate": 1.9975084276727298e-05, + "loss": 0.812, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 1.5050305412209861, + "learning_rate": 1.9974906699408874e-05, + "loss": 0.6326, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 1.428899912874864, + "learning_rate": 1.9974728492323122e-05, + "loss": 0.6581, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 1.6176892667761549, + "learning_rate": 1.9974549655481302e-05, + "loss": 0.7051, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 1.6651281249090957, + "learning_rate": 1.9974370188894698e-05, + "loss": 0.745, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 1.4779479777687863, + "learning_rate": 1.9974190092574645e-05, + "loss": 0.6708, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 1.4294174856552997, + "learning_rate": 1.997400936653251e-05, + "loss": 0.6468, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 1.585851807933539, + "learning_rate": 1.9973828010779702e-05, + "loss": 0.7352, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 1.5257996073144429, + "learning_rate": 1.9973646025327678e-05, + "loss": 0.686, + "step": 676 + }, + { + "epoch": 0.05, + "grad_norm": 1.4566771048747251, + "learning_rate": 1.9973463410187922e-05, + "loss": 0.6547, + "step": 677 + }, + { + "epoch": 0.05, + "grad_norm": 1.4923679582215776, + "learning_rate": 1.9973280165371964e-05, + "loss": 0.7021, + "step": 678 + }, + { + "epoch": 0.05, + "grad_norm": 1.619787806989808, + "learning_rate": 1.9973096290891374e-05, + "loss": 0.756, + "step": 679 + }, + { + "epoch": 0.05, + "grad_norm": 1.5322144254418917, + "learning_rate": 1.997291178675776e-05, + "loss": 0.6807, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 1.4520212719360082, + "learning_rate": 1.997272665298277e-05, + "loss": 0.6949, + "step": 681 + }, + { + "epoch": 0.05, + "grad_norm": 1.3920593151329614, + "learning_rate": 1.99725408895781e-05, + "loss": 0.6489, + "step": 682 + }, + { + "epoch": 0.05, + "grad_norm": 1.383893468736846, + "learning_rate": 1.9972354496555467e-05, + "loss": 0.7454, + "step": 683 + }, + { + "epoch": 0.05, + "grad_norm": 1.5615148872107998, + "learning_rate": 1.997216747392664e-05, + "loss": 0.7026, + "step": 684 + }, + { + "epoch": 0.05, + "grad_norm": 1.4900843793543455, + "learning_rate": 1.9971979821703437e-05, + "loss": 0.7047, + "step": 685 + }, + { + "epoch": 0.05, + "grad_norm": 1.4819541062470392, + "learning_rate": 1.99717915398977e-05, + "loss": 0.6941, + "step": 686 + }, + { + "epoch": 0.05, + "grad_norm": 1.4353172481897687, + "learning_rate": 1.9971602628521312e-05, + "loss": 0.6921, + "step": 687 + }, + { + "epoch": 0.05, + "grad_norm": 1.4927750474281518, + "learning_rate": 1.9971413087586207e-05, + "loss": 0.6746, + "step": 688 + }, + { + "epoch": 0.05, + "grad_norm": 1.5006879182754358, + "learning_rate": 1.9971222917104344e-05, + "loss": 0.6387, + "step": 689 + }, + { + "epoch": 0.05, + "grad_norm": 1.5965338684131045, + "learning_rate": 1.9971032117087736e-05, + "loss": 0.6591, + "step": 690 + }, + { + "epoch": 0.05, + "grad_norm": 1.5332447216441298, + "learning_rate": 1.9970840687548425e-05, + "loss": 0.6956, + "step": 691 + }, + { + "epoch": 0.05, + "grad_norm": 1.5038912685688457, + "learning_rate": 1.99706486284985e-05, + "loss": 0.7015, + "step": 692 + }, + { + "epoch": 0.05, + "grad_norm": 1.3650321756140882, + "learning_rate": 1.9970455939950085e-05, + "loss": 0.6381, + "step": 693 + }, + { + "epoch": 0.05, + "grad_norm": 1.449135667419262, + "learning_rate": 1.9970262621915348e-05, + "loss": 0.6802, + "step": 694 + }, + { + "epoch": 0.05, + "grad_norm": 1.511772371666896, + "learning_rate": 1.9970068674406487e-05, + "loss": 0.7023, + "step": 695 + }, + { + "epoch": 0.05, + "grad_norm": 1.403623331865085, + "learning_rate": 1.9969874097435754e-05, + "loss": 0.6135, + "step": 696 + }, + { + "epoch": 0.05, + "grad_norm": 1.6073687001143515, + "learning_rate": 1.996967889101543e-05, + "loss": 0.6853, + "step": 697 + }, + { + "epoch": 0.05, + "grad_norm": 1.4207683910097149, + "learning_rate": 1.9969483055157846e-05, + "loss": 0.6793, + "step": 698 + }, + { + "epoch": 0.05, + "grad_norm": 1.5201503404503816, + "learning_rate": 1.9969286589875358e-05, + "loss": 0.6931, + "step": 699 + }, + { + "epoch": 0.05, + "grad_norm": 1.413567050526946, + "learning_rate": 1.9969089495180372e-05, + "loss": 0.7178, + "step": 700 + }, + { + "epoch": 0.05, + "grad_norm": 1.458848571585027, + "learning_rate": 1.9968891771085334e-05, + "loss": 0.6818, + "step": 701 + }, + { + "epoch": 0.05, + "grad_norm": 1.5337122369546061, + "learning_rate": 1.996869341760272e-05, + "loss": 0.765, + "step": 702 + }, + { + "epoch": 0.05, + "grad_norm": 1.423220909214009, + "learning_rate": 1.9968494434745065e-05, + "loss": 0.6991, + "step": 703 + }, + { + "epoch": 0.05, + "grad_norm": 1.4333970769118678, + "learning_rate": 1.9968294822524923e-05, + "loss": 0.6886, + "step": 704 + }, + { + "epoch": 0.05, + "grad_norm": 1.4869486933144944, + "learning_rate": 1.99680945809549e-05, + "loss": 0.7294, + "step": 705 + }, + { + "epoch": 0.05, + "grad_norm": 1.447119989931308, + "learning_rate": 1.9967893710047638e-05, + "loss": 0.6627, + "step": 706 + }, + { + "epoch": 0.05, + "grad_norm": 1.54846110090175, + "learning_rate": 1.9967692209815818e-05, + "loss": 0.7537, + "step": 707 + }, + { + "epoch": 0.05, + "grad_norm": 1.429511220935334, + "learning_rate": 1.996749008027216e-05, + "loss": 0.6193, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 1.5018334310035786, + "learning_rate": 1.9967287321429432e-05, + "loss": 0.6831, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 1.5922078918409035, + "learning_rate": 1.996708393330043e-05, + "loss": 0.7098, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 1.4807718012098625, + "learning_rate": 1.9966879915897996e-05, + "loss": 0.6938, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 1.4376444536688462, + "learning_rate": 1.996667526923501e-05, + "loss": 0.7138, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 1.3909481830310941, + "learning_rate": 1.996646999332439e-05, + "loss": 0.662, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 1.4544940365026182, + "learning_rate": 1.9966264088179105e-05, + "loss": 0.6851, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 1.3572577288062038, + "learning_rate": 1.9966057553812144e-05, + "loss": 0.7007, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 1.496013748778949, + "learning_rate": 1.9965850390236554e-05, + "loss": 0.6698, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 1.488084995416177, + "learning_rate": 1.9965642597465412e-05, + "loss": 0.7788, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 1.4330696166458783, + "learning_rate": 1.9965434175511837e-05, + "loss": 0.6799, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 1.4924156453944606, + "learning_rate": 1.9965225124388982e-05, + "loss": 0.6159, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 1.533901488044266, + "learning_rate": 1.9965015444110058e-05, + "loss": 0.6911, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 1.5045640175797126, + "learning_rate": 1.9964805134688294e-05, + "loss": 0.6553, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 1.365117451432009, + "learning_rate": 1.996459419613697e-05, + "loss": 0.6434, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 1.48307486364579, + "learning_rate": 1.9964382628469403e-05, + "loss": 0.6738, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 1.4180241513529275, + "learning_rate": 1.9964170431698953e-05, + "loss": 0.6418, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 1.545127420942442, + "learning_rate": 1.9963957605839014e-05, + "loss": 0.7338, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 1.4687238650330188, + "learning_rate": 1.9963744150903026e-05, + "loss": 0.6834, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 1.47330215819562, + "learning_rate": 1.996353006690446e-05, + "loss": 0.7379, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 1.319754126942464, + "learning_rate": 1.996331535385684e-05, + "loss": 0.6928, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 1.3881803734601454, + "learning_rate": 1.9963100011773716e-05, + "loss": 0.6286, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 1.5061512230106704, + "learning_rate": 1.9962884040668686e-05, + "loss": 0.6475, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 1.4580290704034682, + "learning_rate": 1.9962667440555383e-05, + "loss": 0.6568, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 1.5634361514433177, + "learning_rate": 1.9962450211447485e-05, + "loss": 0.7047, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 1.478199450047416, + "learning_rate": 1.9962232353358707e-05, + "loss": 0.6478, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 1.433620691695921, + "learning_rate": 1.99620138663028e-05, + "loss": 0.6053, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 1.5141799172616108, + "learning_rate": 1.9961794750293558e-05, + "loss": 0.6871, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 1.5598473729761482, + "learning_rate": 1.9961575005344822e-05, + "loss": 0.7024, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 1.3240783076797853, + "learning_rate": 1.996135463147046e-05, + "loss": 0.6251, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 1.350872767863367, + "learning_rate": 1.9961133628684382e-05, + "loss": 0.6219, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 1.6001908546105366, + "learning_rate": 1.996091199700055e-05, + "loss": 0.7471, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 1.3954985012974443, + "learning_rate": 1.9960689736432952e-05, + "loss": 0.6987, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 1.412274109661438, + "learning_rate": 1.996046684699562e-05, + "loss": 0.672, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 1.5066987504890756, + "learning_rate": 1.9960243328702628e-05, + "loss": 0.6547, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 1.4124857083163054, + "learning_rate": 1.9960019181568082e-05, + "loss": 0.6698, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 1.397114388863728, + "learning_rate": 1.995979440560614e-05, + "loss": 0.7203, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 1.417821030501763, + "learning_rate": 1.9959569000830993e-05, + "loss": 0.7162, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 1.409900420169347, + "learning_rate": 1.995934296725687e-05, + "loss": 0.6632, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 1.5140070980868126, + "learning_rate": 1.9959116304898045e-05, + "loss": 0.7177, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 1.5410523694278575, + "learning_rate": 1.995888901376882e-05, + "loss": 0.6075, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 1.4903310841869246, + "learning_rate": 1.9958661093883552e-05, + "loss": 0.6364, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 1.5639198718362435, + "learning_rate": 1.995843254525663e-05, + "loss": 0.7242, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 1.4392599858767008, + "learning_rate": 1.9958203367902482e-05, + "loss": 0.657, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 1.372316950558404, + "learning_rate": 1.995797356183558e-05, + "loss": 0.6053, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 1.4731749574770556, + "learning_rate": 1.9957743127070427e-05, + "loss": 0.6608, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 1.5437065581722909, + "learning_rate": 1.995751206362158e-05, + "loss": 0.6605, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 1.4941770062972664, + "learning_rate": 1.9957280371503617e-05, + "loss": 0.7149, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 1.5275310586871569, + "learning_rate": 1.9957048050731175e-05, + "loss": 0.6705, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 1.3402932972687553, + "learning_rate": 1.9956815101318916e-05, + "loss": 0.6265, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 1.4207912968358263, + "learning_rate": 1.995658152328155e-05, + "loss": 0.708, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 1.4574596603595302, + "learning_rate": 1.9956347316633824e-05, + "loss": 0.6417, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 1.3379912178481204, + "learning_rate": 1.995611248139052e-05, + "loss": 0.6392, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 1.4350808561453696, + "learning_rate": 1.995587701756647e-05, + "loss": 0.6429, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 1.4937766834892385, + "learning_rate": 1.9955640925176543e-05, + "loss": 0.7354, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 1.457003268466653, + "learning_rate": 1.995540420423564e-05, + "loss": 0.6066, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 1.5086050371945936, + "learning_rate": 1.99551668547587e-05, + "loss": 0.7269, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 1.578153931003182, + "learning_rate": 1.9954928876760718e-05, + "loss": 0.7003, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 1.5626557844703117, + "learning_rate": 1.9954690270256717e-05, + "loss": 0.741, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 1.3792139417876308, + "learning_rate": 1.9954451035261754e-05, + "loss": 0.6957, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 1.4373695480199493, + "learning_rate": 1.9954211171790946e-05, + "loss": 0.6691, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 1.5044349434688922, + "learning_rate": 1.9953970679859425e-05, + "loss": 0.7092, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 1.4607424682339447, + "learning_rate": 1.9953729559482383e-05, + "loss": 0.6683, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 1.4423507449253523, + "learning_rate": 1.9953487810675036e-05, + "loss": 0.7058, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 1.2487904895866315, + "learning_rate": 1.995324543345265e-05, + "loss": 0.6222, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 1.401285308678884, + "learning_rate": 1.995300242783053e-05, + "loss": 0.6654, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 1.3711359609860783, + "learning_rate": 1.9952758793824016e-05, + "loss": 0.6917, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 1.5597283767544046, + "learning_rate": 1.995251453144849e-05, + "loss": 0.7009, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 1.412779674017897, + "learning_rate": 1.995226964071937e-05, + "loss": 0.7018, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 1.3907166782976859, + "learning_rate": 1.9952024121652122e-05, + "loss": 0.6661, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 1.3705489769068018, + "learning_rate": 1.9951777974262247e-05, + "loss": 0.7182, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 1.4892289956228386, + "learning_rate": 1.9951531198565287e-05, + "loss": 0.7256, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 1.4639856698997271, + "learning_rate": 1.9951283794576814e-05, + "loss": 0.6965, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 1.421904008171216, + "learning_rate": 1.9951035762312453e-05, + "loss": 0.6684, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 1.4630548973187436, + "learning_rate": 1.995078710178787e-05, + "loss": 0.7054, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 1.4092327274484135, + "learning_rate": 1.9950537813018753e-05, + "loss": 0.6435, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 1.5645408272419137, + "learning_rate": 1.9950287896020846e-05, + "loss": 0.6852, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 1.4678855949590572, + "learning_rate": 1.995003735080993e-05, + "loss": 0.7105, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 1.5484468584903797, + "learning_rate": 1.9949786177401816e-05, + "loss": 0.7102, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 1.4400869394919489, + "learning_rate": 1.994953437581237e-05, + "loss": 0.6719, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 1.5901981493001038, + "learning_rate": 1.9949281946057482e-05, + "loss": 0.724, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 1.4651656547342995, + "learning_rate": 1.99490288881531e-05, + "loss": 0.6788, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 1.3357217179420204, + "learning_rate": 1.994877520211519e-05, + "loss": 0.7071, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 1.4526680668886418, + "learning_rate": 1.9948520887959772e-05, + "loss": 0.6997, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 1.4834590294261867, + "learning_rate": 1.9948265945702905e-05, + "loss": 0.6164, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 1.429815151164952, + "learning_rate": 1.994801037536068e-05, + "loss": 0.7065, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 1.4708895419495984, + "learning_rate": 1.9947754176949238e-05, + "loss": 0.7069, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 1.3637496449638764, + "learning_rate": 1.9947497350484748e-05, + "loss": 0.6075, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 1.487163694816019, + "learning_rate": 1.994723989598343e-05, + "loss": 0.6359, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 1.314120518404876, + "learning_rate": 1.9946981813461534e-05, + "loss": 0.6578, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 1.4543325677508736, + "learning_rate": 1.994672310293536e-05, + "loss": 0.7004, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 1.4491741656825412, + "learning_rate": 1.9946463764421236e-05, + "loss": 0.7365, + "step": 799 + }, + { + "epoch": 0.06, + "grad_norm": 1.5457196014736971, + "learning_rate": 1.994620379793554e-05, + "loss": 0.7442, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 1.5056932013786395, + "learning_rate": 1.9945943203494677e-05, + "loss": 0.6771, + "step": 801 + }, + { + "epoch": 0.06, + "grad_norm": 1.4700297198243129, + "learning_rate": 1.994568198111511e-05, + "loss": 0.5979, + "step": 802 + }, + { + "epoch": 0.06, + "grad_norm": 1.419040999628735, + "learning_rate": 1.9945420130813327e-05, + "loss": 0.7499, + "step": 803 + }, + { + "epoch": 0.06, + "grad_norm": 1.4389450439499267, + "learning_rate": 1.9945157652605854e-05, + "loss": 0.6901, + "step": 804 + }, + { + "epoch": 0.06, + "grad_norm": 1.480339024479065, + "learning_rate": 1.9944894546509276e-05, + "loss": 0.7266, + "step": 805 + }, + { + "epoch": 0.06, + "grad_norm": 1.4854144823540691, + "learning_rate": 1.9944630812540188e-05, + "loss": 0.6809, + "step": 806 + }, + { + "epoch": 0.06, + "grad_norm": 1.62295579712116, + "learning_rate": 1.9944366450715256e-05, + "loss": 0.695, + "step": 807 + }, + { + "epoch": 0.06, + "grad_norm": 1.3998409453367233, + "learning_rate": 1.994410146105116e-05, + "loss": 0.6088, + "step": 808 + }, + { + "epoch": 0.06, + "grad_norm": 1.3946552381773603, + "learning_rate": 1.9943835843564635e-05, + "loss": 0.6593, + "step": 809 + }, + { + "epoch": 0.06, + "grad_norm": 1.3762848226522915, + "learning_rate": 1.994356959827245e-05, + "loss": 0.7106, + "step": 810 + }, + { + "epoch": 0.06, + "grad_norm": 1.3314331591335313, + "learning_rate": 1.9943302725191416e-05, + "loss": 0.6667, + "step": 811 + }, + { + "epoch": 0.06, + "grad_norm": 1.543037432337072, + "learning_rate": 1.9943035224338375e-05, + "loss": 0.7705, + "step": 812 + }, + { + "epoch": 0.06, + "grad_norm": 1.315013884917083, + "learning_rate": 1.9942767095730225e-05, + "loss": 0.6175, + "step": 813 + }, + { + "epoch": 0.06, + "grad_norm": 1.404428489151962, + "learning_rate": 1.994249833938389e-05, + "loss": 0.69, + "step": 814 + }, + { + "epoch": 0.06, + "grad_norm": 1.3887899914605013, + "learning_rate": 1.9942228955316342e-05, + "loss": 0.6877, + "step": 815 + }, + { + "epoch": 0.06, + "grad_norm": 1.5660911490464346, + "learning_rate": 1.994195894354458e-05, + "loss": 0.7207, + "step": 816 + }, + { + "epoch": 0.06, + "grad_norm": 1.315405717275919, + "learning_rate": 1.9941688304085654e-05, + "loss": 0.6147, + "step": 817 + }, + { + "epoch": 0.06, + "grad_norm": 1.606085517269158, + "learning_rate": 1.994141703695666e-05, + "loss": 0.6923, + "step": 818 + }, + { + "epoch": 0.06, + "grad_norm": 1.495239492007301, + "learning_rate": 1.994114514217471e-05, + "loss": 0.6948, + "step": 819 + }, + { + "epoch": 0.06, + "grad_norm": 1.4803970037704692, + "learning_rate": 1.994087261975698e-05, + "loss": 0.7193, + "step": 820 + }, + { + "epoch": 0.06, + "grad_norm": 1.4623073744465764, + "learning_rate": 1.9940599469720675e-05, + "loss": 0.6743, + "step": 821 + }, + { + "epoch": 0.06, + "grad_norm": 1.3700775118988122, + "learning_rate": 1.994032569208304e-05, + "loss": 0.6744, + "step": 822 + }, + { + "epoch": 0.06, + "grad_norm": 1.4744522920524408, + "learning_rate": 1.9940051286861357e-05, + "loss": 0.6349, + "step": 823 + }, + { + "epoch": 0.06, + "grad_norm": 1.3809143228133267, + "learning_rate": 1.993977625407295e-05, + "loss": 0.7152, + "step": 824 + }, + { + "epoch": 0.06, + "grad_norm": 1.3811249861954153, + "learning_rate": 1.993950059373519e-05, + "loss": 0.6776, + "step": 825 + }, + { + "epoch": 0.06, + "grad_norm": 1.387332447348377, + "learning_rate": 1.9939224305865474e-05, + "loss": 0.6825, + "step": 826 + }, + { + "epoch": 0.06, + "grad_norm": 1.4376772895619236, + "learning_rate": 1.993894739048125e-05, + "loss": 0.7792, + "step": 827 + }, + { + "epoch": 0.06, + "grad_norm": 1.4289809185059477, + "learning_rate": 1.9938669847599996e-05, + "loss": 0.7142, + "step": 828 + }, + { + "epoch": 0.06, + "grad_norm": 1.429826656719188, + "learning_rate": 1.993839167723924e-05, + "loss": 0.7495, + "step": 829 + }, + { + "epoch": 0.06, + "grad_norm": 1.4351355138452868, + "learning_rate": 1.993811287941654e-05, + "loss": 0.698, + "step": 830 + }, + { + "epoch": 0.06, + "grad_norm": 1.6273453000519578, + "learning_rate": 1.9937833454149503e-05, + "loss": 0.6459, + "step": 831 + }, + { + "epoch": 0.06, + "grad_norm": 1.4846628010469245, + "learning_rate": 1.9937553401455763e-05, + "loss": 0.6972, + "step": 832 + }, + { + "epoch": 0.06, + "grad_norm": 1.3958818749864583, + "learning_rate": 1.993727272135301e-05, + "loss": 0.6921, + "step": 833 + }, + { + "epoch": 0.06, + "grad_norm": 1.493055560484105, + "learning_rate": 1.993699141385896e-05, + "loss": 0.7243, + "step": 834 + }, + { + "epoch": 0.06, + "grad_norm": 1.5428751856768164, + "learning_rate": 1.9936709478991368e-05, + "loss": 0.6991, + "step": 835 + }, + { + "epoch": 0.06, + "grad_norm": 1.433996824721222, + "learning_rate": 1.9936426916768047e-05, + "loss": 0.6577, + "step": 836 + }, + { + "epoch": 0.06, + "grad_norm": 1.406986721759708, + "learning_rate": 1.9936143727206826e-05, + "loss": 0.6756, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 1.4491021862605051, + "learning_rate": 1.993585991032559e-05, + "loss": 0.6922, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 1.3861849334966856, + "learning_rate": 1.9935575466142256e-05, + "loss": 0.6396, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 1.4831036393419366, + "learning_rate": 1.993529039467478e-05, + "loss": 0.7079, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 1.262685069125964, + "learning_rate": 1.9935004695941164e-05, + "loss": 0.6484, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 1.4497742937620992, + "learning_rate": 1.9934718369959438e-05, + "loss": 0.7075, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 1.375661474123054, + "learning_rate": 1.993443141674769e-05, + "loss": 0.6859, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 1.5053634915575964, + "learning_rate": 1.9934143836324032e-05, + "loss": 0.6376, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 1.4212393545055704, + "learning_rate": 1.9933855628706616e-05, + "loss": 0.7012, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 1.4160093951910373, + "learning_rate": 1.9933566793913646e-05, + "loss": 0.6748, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 1.441844015660788, + "learning_rate": 1.9933277331963354e-05, + "loss": 0.7426, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 1.4391726009568306, + "learning_rate": 1.9932987242874014e-05, + "loss": 0.6829, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 1.4962067324852635, + "learning_rate": 1.993269652666394e-05, + "loss": 0.6847, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 1.5302027799827125, + "learning_rate": 1.9932405183351492e-05, + "loss": 0.7161, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 1.3747106594485758, + "learning_rate": 1.993211321295506e-05, + "loss": 0.6486, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 1.3602210295105055, + "learning_rate": 1.9931820615493075e-05, + "loss": 0.706, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 1.5689407354264588, + "learning_rate": 1.9931527390984016e-05, + "loss": 0.7051, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 1.4528060891282195, + "learning_rate": 1.993123353944639e-05, + "loss": 0.7376, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 1.585080897315511, + "learning_rate": 1.9930939060898754e-05, + "loss": 0.6934, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 1.3884297735294173, + "learning_rate": 1.99306439553597e-05, + "loss": 0.6519, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 1.270608063832158, + "learning_rate": 1.9930348222847856e-05, + "loss": 0.6779, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 1.2991058740998236, + "learning_rate": 1.9930051863381893e-05, + "loss": 0.6505, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 1.3009620976098495, + "learning_rate": 1.9929754876980523e-05, + "loss": 0.5922, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 1.3634349222659414, + "learning_rate": 1.99294572636625e-05, + "loss": 0.6267, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 1.3665605688005118, + "learning_rate": 1.992915902344661e-05, + "loss": 0.6439, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 1.4513939832550313, + "learning_rate": 1.9928860156351683e-05, + "loss": 0.7101, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 1.3809437597840335, + "learning_rate": 1.9928560662396585e-05, + "loss": 0.6228, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 1.5473122797487913, + "learning_rate": 1.9928260541600233e-05, + "loss": 0.6886, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 1.5049679662140467, + "learning_rate": 1.9927959793981567e-05, + "loss": 0.7393, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 1.4529344167118305, + "learning_rate": 1.9927658419559577e-05, + "loss": 0.6844, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 1.490682991867091, + "learning_rate": 1.992735641835329e-05, + "loss": 0.7413, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 1.5769126692244164, + "learning_rate": 1.9927053790381777e-05, + "loss": 0.7244, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 1.6577558329799882, + "learning_rate": 1.9926750535664138e-05, + "loss": 0.6856, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.474557554837615, + "learning_rate": 1.9926446654219526e-05, + "loss": 0.6884, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 1.3771862941846595, + "learning_rate": 1.9926142146067122e-05, + "loss": 0.6543, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 1.4503339810333056, + "learning_rate": 1.9925837011226154e-05, + "loss": 0.6353, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 1.4502429890100663, + "learning_rate": 1.9925531249715883e-05, + "loss": 0.7419, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 1.4198792253094388, + "learning_rate": 1.9925224861555614e-05, + "loss": 0.694, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 1.459053742624924, + "learning_rate": 1.992491784676469e-05, + "loss": 0.6742, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 1.4462206487553946, + "learning_rate": 1.99246102053625e-05, + "loss": 0.6917, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 1.4885586857962874, + "learning_rate": 1.9924301937368463e-05, + "loss": 0.7073, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 1.514222981202289, + "learning_rate": 1.992399304280204e-05, + "loss": 0.7174, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 1.572562129335279, + "learning_rate": 1.992368352168274e-05, + "loss": 0.7322, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 1.4745071880984189, + "learning_rate": 1.9923373374030098e-05, + "loss": 0.6473, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 1.4927841511558657, + "learning_rate": 1.9923062599863694e-05, + "loss": 0.7284, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 1.4004572768689971, + "learning_rate": 1.9922751199203153e-05, + "loss": 0.694, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 1.3811435865604569, + "learning_rate": 1.9922439172068134e-05, + "loss": 0.6778, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 1.6417994337404593, + "learning_rate": 1.9922126518478338e-05, + "loss": 0.7517, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 1.3503645510662545, + "learning_rate": 1.9921813238453505e-05, + "loss": 0.6337, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 1.4140744287820188, + "learning_rate": 1.992149933201341e-05, + "loss": 0.6708, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 1.3250569079432117, + "learning_rate": 1.9921184799177874e-05, + "loss": 0.5896, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 1.6496578237440829, + "learning_rate": 1.9920869639966754e-05, + "loss": 0.7237, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 1.530620211885439, + "learning_rate": 1.9920553854399954e-05, + "loss": 0.6639, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 1.3179543106020766, + "learning_rate": 1.9920237442497402e-05, + "loss": 0.6178, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 1.386334433155643, + "learning_rate": 1.9919920404279078e-05, + "loss": 0.6786, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 1.4601150890608667, + "learning_rate": 1.9919602739765e-05, + "loss": 0.679, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 1.4218650230644563, + "learning_rate": 1.991928444897522e-05, + "loss": 0.6331, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 1.5476835045299449, + "learning_rate": 1.991896553192984e-05, + "loss": 0.663, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 1.3503831778782487, + "learning_rate": 1.9918645988648988e-05, + "loss": 0.6662, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 1.488689696966759, + "learning_rate": 1.991832581915284e-05, + "loss": 0.7097, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 1.5004022376840858, + "learning_rate": 1.991800502346162e-05, + "loss": 0.6784, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 1.5633060664962397, + "learning_rate": 1.9917683601595563e-05, + "loss": 0.7051, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 1.4859099633265669, + "learning_rate": 1.9917361553574974e-05, + "loss": 0.7132, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 1.2765667619949221, + "learning_rate": 1.9917038879420184e-05, + "loss": 0.6212, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 1.4555703373919575, + "learning_rate": 1.9916715579151567e-05, + "loss": 0.7504, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 1.4165845267474468, + "learning_rate": 1.9916391652789526e-05, + "loss": 0.659, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 1.4645597055337136, + "learning_rate": 1.9916067100354523e-05, + "loss": 0.7273, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 1.3920263023686752, + "learning_rate": 1.9915741921867043e-05, + "loss": 0.6225, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 1.4166586632595999, + "learning_rate": 1.9915416117347615e-05, + "loss": 0.6823, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 1.3831162630917924, + "learning_rate": 1.9915089686816813e-05, + "loss": 0.7372, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 1.4050313119351863, + "learning_rate": 1.991476263029524e-05, + "loss": 0.6783, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 1.4914840717903826, + "learning_rate": 1.9914434947803552e-05, + "loss": 0.6823, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 1.487423066145206, + "learning_rate": 1.9914106639362437e-05, + "loss": 0.7083, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 1.5068998589001918, + "learning_rate": 1.9913777704992613e-05, + "loss": 0.7593, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 1.3678746267863946, + "learning_rate": 1.991344814471486e-05, + "loss": 0.6488, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 1.3588297397128217, + "learning_rate": 1.9913117958549975e-05, + "loss": 0.651, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 1.40965020988121, + "learning_rate": 1.9912787146518812e-05, + "loss": 0.6457, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 1.4230327713356739, + "learning_rate": 1.991245570864225e-05, + "loss": 0.6117, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 1.3945193250130232, + "learning_rate": 1.9912123644941218e-05, + "loss": 0.6508, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 1.493799826457363, + "learning_rate": 1.9911790955436682e-05, + "loss": 0.7071, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 1.4551092566929722, + "learning_rate": 1.9911457640149642e-05, + "loss": 0.6186, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 1.4483474621158612, + "learning_rate": 1.991112369910115e-05, + "loss": 0.652, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 1.3635141780327364, + "learning_rate": 1.9910789132312278e-05, + "loss": 0.6612, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 1.4896489464596532, + "learning_rate": 1.9910453939804156e-05, + "loss": 0.6837, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 1.4126587955708032, + "learning_rate": 1.991011812159795e-05, + "loss": 0.6654, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 1.8112980375902783, + "learning_rate": 1.9909781677714854e-05, + "loss": 0.6903, + "step": 922 + }, + { + "epoch": 0.07, + "grad_norm": 1.541463383168267, + "learning_rate": 1.9909444608176117e-05, + "loss": 0.6427, + "step": 923 + }, + { + "epoch": 0.07, + "grad_norm": 1.3863376577333875, + "learning_rate": 1.9909106913003013e-05, + "loss": 0.6712, + "step": 924 + }, + { + "epoch": 0.07, + "grad_norm": 1.5331110641150918, + "learning_rate": 1.9908768592216862e-05, + "loss": 0.7042, + "step": 925 + }, + { + "epoch": 0.07, + "grad_norm": 1.4753000827000902, + "learning_rate": 1.990842964583903e-05, + "loss": 0.6313, + "step": 926 + }, + { + "epoch": 0.07, + "grad_norm": 1.440826382080631, + "learning_rate": 1.9908090073890915e-05, + "loss": 0.7069, + "step": 927 + }, + { + "epoch": 0.07, + "grad_norm": 1.3562332328076356, + "learning_rate": 1.9907749876393955e-05, + "loss": 0.654, + "step": 928 + }, + { + "epoch": 0.07, + "grad_norm": 1.4162225401085708, + "learning_rate": 1.9907409053369625e-05, + "loss": 0.6433, + "step": 929 + }, + { + "epoch": 0.07, + "grad_norm": 1.3852365228465156, + "learning_rate": 1.990706760483945e-05, + "loss": 0.6739, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 1.4268325900953065, + "learning_rate": 1.990672553082498e-05, + "loss": 0.7006, + "step": 931 + }, + { + "epoch": 0.07, + "grad_norm": 1.4229980895993992, + "learning_rate": 1.9906382831347815e-05, + "loss": 0.7259, + "step": 932 + }, + { + "epoch": 0.07, + "grad_norm": 1.429748533853472, + "learning_rate": 1.9906039506429594e-05, + "loss": 0.7383, + "step": 933 + }, + { + "epoch": 0.07, + "grad_norm": 1.3603332036320621, + "learning_rate": 1.990569555609199e-05, + "loss": 0.6462, + "step": 934 + }, + { + "epoch": 0.07, + "grad_norm": 1.5254504867315368, + "learning_rate": 1.9905350980356717e-05, + "loss": 0.7234, + "step": 935 + }, + { + "epoch": 0.07, + "grad_norm": 1.3442592543002676, + "learning_rate": 1.990500577924553e-05, + "loss": 0.6751, + "step": 936 + }, + { + "epoch": 0.07, + "grad_norm": 1.3598834435219844, + "learning_rate": 1.990465995278023e-05, + "loss": 0.7251, + "step": 937 + }, + { + "epoch": 0.07, + "grad_norm": 1.3867053823767008, + "learning_rate": 1.9904313500982645e-05, + "loss": 0.7055, + "step": 938 + }, + { + "epoch": 0.07, + "grad_norm": 1.2992096992952753, + "learning_rate": 1.9903966423874648e-05, + "loss": 0.6868, + "step": 939 + }, + { + "epoch": 0.07, + "grad_norm": 1.4236460963139852, + "learning_rate": 1.9903618721478154e-05, + "loss": 0.6872, + "step": 940 + }, + { + "epoch": 0.07, + "grad_norm": 1.3349318706972957, + "learning_rate": 1.9903270393815112e-05, + "loss": 0.6536, + "step": 941 + }, + { + "epoch": 0.07, + "grad_norm": 1.5064617688746478, + "learning_rate": 1.9902921440907517e-05, + "loss": 0.7324, + "step": 942 + }, + { + "epoch": 0.07, + "grad_norm": 1.4943537940256149, + "learning_rate": 1.99025718627774e-05, + "loss": 0.7533, + "step": 943 + }, + { + "epoch": 0.07, + "grad_norm": 1.326285490806938, + "learning_rate": 1.990222165944683e-05, + "loss": 0.6096, + "step": 944 + }, + { + "epoch": 0.07, + "grad_norm": 1.4308625275631561, + "learning_rate": 1.9901870830937916e-05, + "loss": 0.7198, + "step": 945 + }, + { + "epoch": 0.07, + "grad_norm": 1.4860911036221065, + "learning_rate": 1.9901519377272813e-05, + "loss": 0.7275, + "step": 946 + }, + { + "epoch": 0.07, + "grad_norm": 1.3758716421418185, + "learning_rate": 1.9901167298473704e-05, + "loss": 0.6426, + "step": 947 + }, + { + "epoch": 0.07, + "grad_norm": 1.3939826377734255, + "learning_rate": 1.990081459456282e-05, + "loss": 0.6446, + "step": 948 + }, + { + "epoch": 0.07, + "grad_norm": 1.4762685170251857, + "learning_rate": 1.990046126556243e-05, + "loss": 0.6721, + "step": 949 + }, + { + "epoch": 0.07, + "grad_norm": 1.316357122121463, + "learning_rate": 1.9900107311494844e-05, + "loss": 0.6468, + "step": 950 + }, + { + "epoch": 0.07, + "grad_norm": 1.4429846084789009, + "learning_rate": 1.98997527323824e-05, + "loss": 0.7011, + "step": 951 + }, + { + "epoch": 0.07, + "grad_norm": 1.4064551733598605, + "learning_rate": 1.9899397528247496e-05, + "loss": 0.687, + "step": 952 + }, + { + "epoch": 0.07, + "grad_norm": 1.4627218518349863, + "learning_rate": 1.9899041699112547e-05, + "loss": 0.6897, + "step": 953 + }, + { + "epoch": 0.07, + "grad_norm": 1.3036922382056675, + "learning_rate": 1.9898685245000023e-05, + "loss": 0.6743, + "step": 954 + }, + { + "epoch": 0.07, + "grad_norm": 1.487256195219612, + "learning_rate": 1.9898328165932432e-05, + "loss": 0.6504, + "step": 955 + }, + { + "epoch": 0.07, + "grad_norm": 1.488864574107309, + "learning_rate": 1.989797046193231e-05, + "loss": 0.6602, + "step": 956 + }, + { + "epoch": 0.07, + "grad_norm": 1.4369606996708997, + "learning_rate": 1.9897612133022252e-05, + "loss": 0.6301, + "step": 957 + }, + { + "epoch": 0.07, + "grad_norm": 1.3805988934778417, + "learning_rate": 1.9897253179224872e-05, + "loss": 0.6489, + "step": 958 + }, + { + "epoch": 0.07, + "grad_norm": 1.24645412101121, + "learning_rate": 1.9896893600562836e-05, + "loss": 0.6412, + "step": 959 + }, + { + "epoch": 0.07, + "grad_norm": 1.5830254339029253, + "learning_rate": 1.9896533397058846e-05, + "loss": 0.7216, + "step": 960 + }, + { + "epoch": 0.07, + "grad_norm": 1.4225778205113302, + "learning_rate": 1.9896172568735642e-05, + "loss": 0.6761, + "step": 961 + }, + { + "epoch": 0.07, + "grad_norm": 1.425753825720842, + "learning_rate": 1.989581111561601e-05, + "loss": 0.6642, + "step": 962 + }, + { + "epoch": 0.07, + "grad_norm": 1.3112592736453563, + "learning_rate": 1.989544903772276e-05, + "loss": 0.6485, + "step": 963 + }, + { + "epoch": 0.07, + "grad_norm": 1.3623783652231403, + "learning_rate": 1.9895086335078767e-05, + "loss": 0.6011, + "step": 964 + }, + { + "epoch": 0.07, + "grad_norm": 1.446612129472585, + "learning_rate": 1.9894723007706916e-05, + "loss": 0.7153, + "step": 965 + }, + { + "epoch": 0.07, + "grad_norm": 1.330918549301755, + "learning_rate": 1.989435905563015e-05, + "loss": 0.679, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 1.3840427421904582, + "learning_rate": 1.9893994478871456e-05, + "loss": 0.6383, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 1.3659906434678413, + "learning_rate": 1.989362927745384e-05, + "loss": 0.7313, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 1.4293000576987829, + "learning_rate": 1.989326345140036e-05, + "loss": 0.6224, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 1.3549320796539817, + "learning_rate": 1.9892897000734122e-05, + "loss": 0.6781, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 1.7309566869049375, + "learning_rate": 1.989252992547825e-05, + "loss": 0.5915, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 1.471031932372707, + "learning_rate": 1.989216222565593e-05, + "loss": 0.6853, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 1.29330896240718, + "learning_rate": 1.9891793901290367e-05, + "loss": 0.5932, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 1.39955222598446, + "learning_rate": 1.9891424952404825e-05, + "loss": 0.6759, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091365314821897, + "learning_rate": 1.989105537902259e-05, + "loss": 0.6615, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 1.3550494862067861, + "learning_rate": 1.9890685181166996e-05, + "loss": 0.6901, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 1.4487513690774159, + "learning_rate": 1.989031435886142e-05, + "loss": 0.6796, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 1.5008200152433027, + "learning_rate": 1.988994291212927e-05, + "loss": 0.6849, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 1.474959214503317, + "learning_rate": 1.9889570840994e-05, + "loss": 0.6223, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 1.4913596208415856, + "learning_rate": 1.98891981454791e-05, + "loss": 0.7232, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.2386380232194563, + "learning_rate": 1.9888824825608097e-05, + "loss": 0.6471, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 1.5064282165451475, + "learning_rate": 1.9888450881404563e-05, + "loss": 0.7048, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 1.4884845264413473, + "learning_rate": 1.988807631289211e-05, + "loss": 0.7184, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 1.3160290729563668, + "learning_rate": 1.988770112009438e-05, + "loss": 0.6145, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 1.4277086581772478, + "learning_rate": 1.9887325303035063e-05, + "loss": 0.7313, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 1.3675087687544234, + "learning_rate": 1.9886948861737892e-05, + "loss": 0.6608, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 1.531677342164226, + "learning_rate": 1.988657179622663e-05, + "loss": 0.7273, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 1.4073944097587059, + "learning_rate": 1.9886194106525082e-05, + "loss": 0.7083, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 1.3219467522376414, + "learning_rate": 1.988581579265709e-05, + "loss": 0.6774, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 1.4440426583527428, + "learning_rate": 1.9885436854646546e-05, + "loss": 0.6863, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 1.439198527083775, + "learning_rate": 1.9885057292517373e-05, + "loss": 0.6843, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 1.2199872901129378, + "learning_rate": 1.9884677106293528e-05, + "loss": 0.643, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 1.3182833275230958, + "learning_rate": 1.9884296295999022e-05, + "loss": 0.6841, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 1.3048183809522353, + "learning_rate": 1.9883914861657895e-05, + "loss": 0.6242, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 1.4444066688700568, + "learning_rate": 1.988353280329423e-05, + "loss": 0.7121, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 1.382637012790591, + "learning_rate": 1.9883150120932144e-05, + "loss": 0.6451, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 1.3578731966664257, + "learning_rate": 1.98827668145958e-05, + "loss": 0.6442, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 1.3944573476320203, + "learning_rate": 1.9882382884309402e-05, + "loss": 0.7148, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 1.3060491161019383, + "learning_rate": 1.9881998330097184e-05, + "loss": 0.6531, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 1.4090578657062638, + "learning_rate": 1.9881613151983425e-05, + "loss": 0.7014, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 1.3586187341727907, + "learning_rate": 1.9881227349992448e-05, + "loss": 0.6605, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 1.459980044032585, + "learning_rate": 1.9880840924148606e-05, + "loss": 0.7058, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 1.2951328571759797, + "learning_rate": 1.9880453874476302e-05, + "loss": 0.6291, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 1.3904379815348435, + "learning_rate": 1.9880066200999963e-05, + "loss": 0.6767, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 1.5176683789372114, + "learning_rate": 1.9879677903744076e-05, + "loss": 0.7051, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 1.4120921955672878, + "learning_rate": 1.9879288982733146e-05, + "loss": 0.6749, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 1.3639943790795412, + "learning_rate": 1.9878899437991736e-05, + "loss": 0.6627, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 1.333326945686934, + "learning_rate": 1.987850926954443e-05, + "loss": 0.6463, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 1.433813426469793, + "learning_rate": 1.987811847741587e-05, + "loss": 0.6474, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 1.4291352419646783, + "learning_rate": 1.9877727061630732e-05, + "loss": 0.6823, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 1.318844808994173, + "learning_rate": 1.9877335022213716e-05, + "loss": 0.6547, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 1.3555024772207713, + "learning_rate": 1.987694235918958e-05, + "loss": 0.6181, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 1.3196808246332612, + "learning_rate": 1.9876549072583116e-05, + "loss": 0.6266, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 1.2748886452460877, + "learning_rate": 1.9876155162419155e-05, + "loss": 0.6764, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 1.5086678564280764, + "learning_rate": 1.987576062872256e-05, + "loss": 0.6756, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 1.4816374581843959, + "learning_rate": 1.9875365471518248e-05, + "loss": 0.672, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 1.3192516782835608, + "learning_rate": 1.987496969083116e-05, + "loss": 0.6507, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 1.453259615661739, + "learning_rate": 1.987457328668629e-05, + "loss": 0.6735, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 1.4272838460945279, + "learning_rate": 1.987417625910866e-05, + "loss": 0.6806, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 1.4206384165273391, + "learning_rate": 1.987377860812334e-05, + "loss": 0.7037, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 1.6001073413843292, + "learning_rate": 1.9873380333755437e-05, + "loss": 0.684, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 1.3913517028294793, + "learning_rate": 1.987298143603009e-05, + "loss": 0.7063, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 1.2660852702051149, + "learning_rate": 1.9872581914972486e-05, + "loss": 0.583, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 1.3821509982728633, + "learning_rate": 1.9872181770607852e-05, + "loss": 0.6473, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 1.4060382683621004, + "learning_rate": 1.987178100296145e-05, + "loss": 0.6761, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 1.4819007731641538, + "learning_rate": 1.987137961205858e-05, + "loss": 0.7451, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 1.3173853009584309, + "learning_rate": 1.9870977597924586e-05, + "loss": 0.6535, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 1.3595085078482159, + "learning_rate": 1.987057496058485e-05, + "loss": 0.6398, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 1.306198615564732, + "learning_rate": 1.987017170006479e-05, + "loss": 0.615, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 1.4706134743488104, + "learning_rate": 1.9869767816389867e-05, + "loss": 0.702, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 1.3656120160007177, + "learning_rate": 1.986936330958558e-05, + "loss": 0.6992, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 1.498139499153313, + "learning_rate": 1.986895817967747e-05, + "loss": 0.6588, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 1.5089207662733624, + "learning_rate": 1.9868552426691113e-05, + "loss": 0.6933, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 1.4074484063240944, + "learning_rate": 1.9868146050652123e-05, + "loss": 0.6611, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 1.4470945352387474, + "learning_rate": 1.9867739051586163e-05, + "loss": 0.6358, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 1.29824660053539, + "learning_rate": 1.9867331429518922e-05, + "loss": 0.6703, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 1.3373726614826371, + "learning_rate": 1.9866923184476143e-05, + "loss": 0.652, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 1.6000545194397107, + "learning_rate": 1.9866514316483597e-05, + "loss": 0.7112, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 1.4534900985973662, + "learning_rate": 1.9866104825567096e-05, + "loss": 0.6566, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 1.4244360862043022, + "learning_rate": 1.9865694711752498e-05, + "loss": 0.6325, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 1.4065804411161942, + "learning_rate": 1.986528397506569e-05, + "loss": 0.6604, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 1.2632019489902635, + "learning_rate": 1.9864872615532605e-05, + "loss": 0.6187, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 1.401284585571879, + "learning_rate": 1.9864460633179215e-05, + "loss": 0.6613, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 1.6324542807096245, + "learning_rate": 1.9864048028031535e-05, + "loss": 0.6717, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 1.4310049855100742, + "learning_rate": 1.9863634800115606e-05, + "loss": 0.6598, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 1.420662667026436, + "learning_rate": 1.9863220949457528e-05, + "loss": 0.6392, + "step": 1046 + }, + { + "epoch": 0.08, + "grad_norm": 1.423003283536424, + "learning_rate": 1.9862806476083422e-05, + "loss": 0.6136, + "step": 1047 + }, + { + "epoch": 0.08, + "grad_norm": 1.3952984662861372, + "learning_rate": 1.9862391380019458e-05, + "loss": 0.7323, + "step": 1048 + }, + { + "epoch": 0.08, + "grad_norm": 1.4141107624732887, + "learning_rate": 1.9861975661291845e-05, + "loss": 0.6745, + "step": 1049 + }, + { + "epoch": 0.08, + "grad_norm": 1.3056460759501103, + "learning_rate": 1.9861559319926825e-05, + "loss": 0.6908, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 1.309622289207852, + "learning_rate": 1.9861142355950685e-05, + "loss": 0.6436, + "step": 1051 + }, + { + "epoch": 0.08, + "grad_norm": 1.333428642720928, + "learning_rate": 1.9860724769389754e-05, + "loss": 0.6094, + "step": 1052 + }, + { + "epoch": 0.08, + "grad_norm": 1.4631047620518394, + "learning_rate": 1.986030656027039e-05, + "loss": 0.6917, + "step": 1053 + }, + { + "epoch": 0.08, + "grad_norm": 1.3177733524650974, + "learning_rate": 1.9859887728619002e-05, + "loss": 0.6302, + "step": 1054 + }, + { + "epoch": 0.08, + "grad_norm": 1.44428839614962, + "learning_rate": 1.9859468274462034e-05, + "loss": 0.647, + "step": 1055 + }, + { + "epoch": 0.08, + "grad_norm": 1.386861702441382, + "learning_rate": 1.9859048197825963e-05, + "loss": 0.6952, + "step": 1056 + }, + { + "epoch": 0.08, + "grad_norm": 1.4660066185056326, + "learning_rate": 1.9858627498737313e-05, + "loss": 0.7264, + "step": 1057 + }, + { + "epoch": 0.08, + "grad_norm": 1.3794503151745687, + "learning_rate": 1.9858206177222646e-05, + "loss": 0.5743, + "step": 1058 + }, + { + "epoch": 0.08, + "grad_norm": 1.447135888589788, + "learning_rate": 1.9857784233308562e-05, + "loss": 0.6745, + "step": 1059 + }, + { + "epoch": 0.08, + "grad_norm": 1.3665941967396986, + "learning_rate": 1.9857361667021696e-05, + "loss": 0.5913, + "step": 1060 + }, + { + "epoch": 0.08, + "grad_norm": 1.3165333400696055, + "learning_rate": 1.9856938478388735e-05, + "loss": 0.6582, + "step": 1061 + }, + { + "epoch": 0.08, + "grad_norm": 1.3262803675188708, + "learning_rate": 1.9856514667436393e-05, + "loss": 0.6925, + "step": 1062 + }, + { + "epoch": 0.08, + "grad_norm": 1.5519308062200847, + "learning_rate": 1.9856090234191424e-05, + "loss": 0.6948, + "step": 1063 + }, + { + "epoch": 0.08, + "grad_norm": 1.5297323218413756, + "learning_rate": 1.985566517868063e-05, + "loss": 0.6706, + "step": 1064 + }, + { + "epoch": 0.08, + "grad_norm": 1.3614990915133625, + "learning_rate": 1.9855239500930846e-05, + "loss": 0.7215, + "step": 1065 + }, + { + "epoch": 0.08, + "grad_norm": 1.426998340089791, + "learning_rate": 1.9854813200968942e-05, + "loss": 0.6469, + "step": 1066 + }, + { + "epoch": 0.08, + "grad_norm": 1.4413188297878274, + "learning_rate": 1.985438627882184e-05, + "loss": 0.6302, + "step": 1067 + }, + { + "epoch": 0.08, + "grad_norm": 1.2690913450583652, + "learning_rate": 1.9853958734516487e-05, + "loss": 0.5659, + "step": 1068 + }, + { + "epoch": 0.08, + "grad_norm": 1.5116095293988812, + "learning_rate": 1.9853530568079882e-05, + "loss": 0.7077, + "step": 1069 + }, + { + "epoch": 0.08, + "grad_norm": 1.3332191607150499, + "learning_rate": 1.9853101779539052e-05, + "loss": 0.6324, + "step": 1070 + }, + { + "epoch": 0.08, + "grad_norm": 1.3016776813589728, + "learning_rate": 1.9852672368921074e-05, + "loss": 0.6219, + "step": 1071 + }, + { + "epoch": 0.08, + "grad_norm": 1.4503435977428494, + "learning_rate": 1.9852242336253057e-05, + "loss": 0.6758, + "step": 1072 + }, + { + "epoch": 0.08, + "grad_norm": 1.4257728889951766, + "learning_rate": 1.985181168156215e-05, + "loss": 0.6645, + "step": 1073 + }, + { + "epoch": 0.08, + "grad_norm": 1.3685159822840158, + "learning_rate": 1.985138040487554e-05, + "loss": 0.6428, + "step": 1074 + }, + { + "epoch": 0.08, + "grad_norm": 1.4001830730629623, + "learning_rate": 1.985094850622046e-05, + "loss": 0.6404, + "step": 1075 + }, + { + "epoch": 0.08, + "grad_norm": 1.5185190472606231, + "learning_rate": 1.985051598562418e-05, + "loss": 0.6866, + "step": 1076 + }, + { + "epoch": 0.08, + "grad_norm": 1.4118465110332843, + "learning_rate": 1.9850082843114e-05, + "loss": 0.621, + "step": 1077 + }, + { + "epoch": 0.08, + "grad_norm": 1.4080094245248436, + "learning_rate": 1.984964907871727e-05, + "loss": 0.7097, + "step": 1078 + }, + { + "epoch": 0.08, + "grad_norm": 1.454160669891905, + "learning_rate": 1.9849214692461375e-05, + "loss": 0.6783, + "step": 1079 + }, + { + "epoch": 0.08, + "grad_norm": 1.3085695634571022, + "learning_rate": 1.9848779684373746e-05, + "loss": 0.671, + "step": 1080 + }, + { + "epoch": 0.08, + "grad_norm": 1.4117438765349613, + "learning_rate": 1.9848344054481838e-05, + "loss": 0.6425, + "step": 1081 + }, + { + "epoch": 0.08, + "grad_norm": 1.4725912469334606, + "learning_rate": 1.9847907802813165e-05, + "loss": 0.672, + "step": 1082 + }, + { + "epoch": 0.08, + "grad_norm": 1.4635007678343672, + "learning_rate": 1.9847470929395257e-05, + "loss": 0.7695, + "step": 1083 + }, + { + "epoch": 0.08, + "grad_norm": 1.4571029195468161, + "learning_rate": 1.9847033434255708e-05, + "loss": 0.7051, + "step": 1084 + }, + { + "epoch": 0.08, + "grad_norm": 1.309554473303262, + "learning_rate": 1.984659531742213e-05, + "loss": 0.6515, + "step": 1085 + }, + { + "epoch": 0.08, + "grad_norm": 1.2763513098548585, + "learning_rate": 1.984615657892219e-05, + "loss": 0.6556, + "step": 1086 + }, + { + "epoch": 0.08, + "grad_norm": 1.3691668324268231, + "learning_rate": 1.9845717218783588e-05, + "loss": 0.6564, + "step": 1087 + }, + { + "epoch": 0.08, + "grad_norm": 1.2869796571791097, + "learning_rate": 1.9845277237034057e-05, + "loss": 0.6352, + "step": 1088 + }, + { + "epoch": 0.08, + "grad_norm": 1.3807977341594648, + "learning_rate": 1.9844836633701383e-05, + "loss": 0.6988, + "step": 1089 + }, + { + "epoch": 0.08, + "grad_norm": 1.3444731341252991, + "learning_rate": 1.9844395408813376e-05, + "loss": 0.6743, + "step": 1090 + }, + { + "epoch": 0.08, + "grad_norm": 1.307205420254287, + "learning_rate": 1.9843953562397896e-05, + "loss": 0.6507, + "step": 1091 + }, + { + "epoch": 0.08, + "grad_norm": 1.384103764814393, + "learning_rate": 1.984351109448284e-05, + "loss": 0.7041, + "step": 1092 + }, + { + "epoch": 0.08, + "grad_norm": 1.311451084075518, + "learning_rate": 1.9843068005096146e-05, + "loss": 0.6769, + "step": 1093 + }, + { + "epoch": 0.08, + "grad_norm": 1.475281093774608, + "learning_rate": 1.9842624294265783e-05, + "loss": 0.6757, + "step": 1094 + }, + { + "epoch": 0.08, + "grad_norm": 1.2847458204330213, + "learning_rate": 1.9842179962019768e-05, + "loss": 0.629, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 1.477745510687581, + "learning_rate": 1.9841735008386154e-05, + "loss": 0.7262, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 1.419998775535378, + "learning_rate": 1.984128943339303e-05, + "loss": 0.6711, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 1.3256467698183303, + "learning_rate": 1.984084323706853e-05, + "loss": 0.6787, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 1.4531825884820975, + "learning_rate": 1.9840396419440825e-05, + "loss": 0.6695, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 1.3048689481914824, + "learning_rate": 1.9839948980538124e-05, + "loss": 0.6897, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.4104249650644398, + "learning_rate": 1.9839500920388676e-05, + "loss": 0.6653, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 1.4111668988503194, + "learning_rate": 1.983905223902077e-05, + "loss": 0.7027, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 1.36188376428691, + "learning_rate": 1.9838602936462732e-05, + "loss": 0.6676, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 1.2403181873747628, + "learning_rate": 1.983815301274293e-05, + "loss": 0.6282, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 1.3271663122818982, + "learning_rate": 1.9837702467889772e-05, + "loss": 0.6733, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 1.4194330898476917, + "learning_rate": 1.9837251301931704e-05, + "loss": 0.6505, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 1.2082126107692968, + "learning_rate": 1.9836799514897204e-05, + "loss": 0.6707, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 1.3665789312408116, + "learning_rate": 1.98363471068148e-05, + "loss": 0.6748, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 1.3399140809398349, + "learning_rate": 1.9835894077713055e-05, + "loss": 0.6376, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 1.3799549469987997, + "learning_rate": 1.9835440427620568e-05, + "loss": 0.677, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 1.2417224517340832, + "learning_rate": 1.9834986156565984e-05, + "loss": 0.6271, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 1.2371500907728634, + "learning_rate": 1.9834531264577984e-05, + "loss": 0.6623, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 1.415355533780316, + "learning_rate": 1.9834075751685283e-05, + "loss": 0.6715, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 1.555496312141032, + "learning_rate": 1.9833619617916645e-05, + "loss": 0.6729, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 1.52222421700903, + "learning_rate": 1.9833162863300863e-05, + "loss": 0.6321, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 1.4989224378128634, + "learning_rate": 1.983270548786678e-05, + "loss": 0.7222, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 1.4995699901928754, + "learning_rate": 1.9832247491643267e-05, + "loss": 0.7012, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 1.333947780213303, + "learning_rate": 1.983178887465924e-05, + "loss": 0.6509, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 1.647961588495242, + "learning_rate": 1.983132963694366e-05, + "loss": 0.8132, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 1.5941082420883146, + "learning_rate": 1.9830869778525517e-05, + "loss": 0.6881, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.4562004932289037, + "learning_rate": 1.9830409299433845e-05, + "loss": 0.6922, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 1.3253266867481204, + "learning_rate": 1.9829948199697713e-05, + "loss": 0.6131, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 1.3194267868511274, + "learning_rate": 1.9829486479346237e-05, + "loss": 0.6601, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 1.3032366057614475, + "learning_rate": 1.9829024138408567e-05, + "loss": 0.6463, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 1.3526579105754335, + "learning_rate": 1.982856117691389e-05, + "loss": 0.6676, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 1.4044047647130808, + "learning_rate": 1.9828097594891435e-05, + "loss": 0.6607, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 1.4112826680270278, + "learning_rate": 1.982763339237048e-05, + "loss": 0.5893, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 1.4209828354982046, + "learning_rate": 1.982716856938032e-05, + "loss": 0.6859, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 1.3437338983325133, + "learning_rate": 1.9826703125950302e-05, + "loss": 0.6214, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 1.3298890459774835, + "learning_rate": 1.982623706210982e-05, + "loss": 0.6727, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 1.3315551443910691, + "learning_rate": 1.98257703778883e-05, + "loss": 0.6581, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 1.2907969972106836, + "learning_rate": 1.9825303073315197e-05, + "loss": 0.6066, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 1.2909279936396814, + "learning_rate": 1.9824835148420023e-05, + "loss": 0.5687, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 1.424877894339963, + "learning_rate": 1.982436660323231e-05, + "loss": 0.7123, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 1.4897768210633964, + "learning_rate": 1.9823897437781654e-05, + "loss": 0.7402, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 1.4471998109666606, + "learning_rate": 1.9823427652097663e-05, + "loss": 0.6692, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 1.4191902721067329, + "learning_rate": 1.9822957246210003e-05, + "loss": 0.6479, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 1.4409011741597924, + "learning_rate": 1.9822486220148376e-05, + "loss": 0.6955, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 1.3597189369504, + "learning_rate": 1.9822014573942514e-05, + "loss": 0.6567, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 1.4575559269691574, + "learning_rate": 1.98215423076222e-05, + "loss": 0.6694, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 1.3492958599027343, + "learning_rate": 1.982106942121724e-05, + "loss": 0.6675, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 1.3921489723089153, + "learning_rate": 1.9820595914757505e-05, + "loss": 0.649, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 1.3440147848735216, + "learning_rate": 1.9820121788272873e-05, + "loss": 0.6429, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 1.3360572002837734, + "learning_rate": 1.9819647041793298e-05, + "loss": 0.7376, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 1.2438325844955456, + "learning_rate": 1.9819171675348736e-05, + "loss": 0.6687, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 1.3711687377246966, + "learning_rate": 1.9818695688969207e-05, + "loss": 0.7174, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 1.3686320929061715, + "learning_rate": 1.9818219082684762e-05, + "loss": 0.6525, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 1.4988193633991862, + "learning_rate": 1.9817741856525494e-05, + "loss": 0.6038, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 1.430496903897148, + "learning_rate": 1.9817264010521524e-05, + "loss": 0.678, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 1.3119959998883932, + "learning_rate": 1.981678554470303e-05, + "loss": 0.6368, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 1.5116578713315048, + "learning_rate": 1.9816306459100218e-05, + "loss": 0.7125, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 1.3733899486933328, + "learning_rate": 1.981582675374333e-05, + "loss": 0.7114, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 1.4422723912540691, + "learning_rate": 1.9815346428662655e-05, + "loss": 0.6551, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 1.2559756022015318, + "learning_rate": 1.9814865483888527e-05, + "loss": 0.6528, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 1.374362104007575, + "learning_rate": 1.9814383919451298e-05, + "loss": 0.6326, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 1.340186606267172, + "learning_rate": 1.9813901735381373e-05, + "loss": 0.6691, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 1.4115398097246712, + "learning_rate": 1.9813418931709203e-05, + "loss": 0.6561, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 1.382567173807268, + "learning_rate": 1.9812935508465263e-05, + "loss": 0.6679, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 1.4407357824929055, + "learning_rate": 1.981245146568008e-05, + "loss": 0.694, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 1.187108276406847, + "learning_rate": 1.9811966803384208e-05, + "loss": 0.6749, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 1.3297259835076982, + "learning_rate": 1.9811481521608245e-05, + "loss": 0.6108, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 1.335911756122454, + "learning_rate": 1.981099562038284e-05, + "loss": 0.6605, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 1.3371802455832802, + "learning_rate": 1.981050909973866e-05, + "loss": 0.6176, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 1.3062167314007176, + "learning_rate": 1.981002195970642e-05, + "loss": 0.5866, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 1.32331668883329, + "learning_rate": 1.980953420031689e-05, + "loss": 0.6782, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 1.202468742630543, + "learning_rate": 1.980904582160085e-05, + "loss": 0.5988, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 1.2650604401726164, + "learning_rate": 1.980855682358914e-05, + "loss": 0.6952, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 1.3592897585747172, + "learning_rate": 1.9808067206312632e-05, + "loss": 0.6597, + "step": 1168 + }, + { + "epoch": 0.09, + "grad_norm": 1.4574664491223743, + "learning_rate": 1.980757696980224e-05, + "loss": 0.5817, + "step": 1169 + }, + { + "epoch": 0.09, + "grad_norm": 1.4828517124791651, + "learning_rate": 1.980708611408891e-05, + "loss": 0.739, + "step": 1170 + }, + { + "epoch": 0.09, + "grad_norm": 1.2886534995682364, + "learning_rate": 1.9806594639203637e-05, + "loss": 0.6297, + "step": 1171 + }, + { + "epoch": 0.09, + "grad_norm": 1.5097725733588492, + "learning_rate": 1.980610254517745e-05, + "loss": 0.7327, + "step": 1172 + }, + { + "epoch": 0.09, + "grad_norm": 1.3970631500448782, + "learning_rate": 1.980560983204142e-05, + "loss": 0.7241, + "step": 1173 + }, + { + "epoch": 0.09, + "grad_norm": 1.215500730620227, + "learning_rate": 1.9805116499826646e-05, + "loss": 0.6264, + "step": 1174 + }, + { + "epoch": 0.09, + "grad_norm": 1.360247978434702, + "learning_rate": 1.980462254856428e-05, + "loss": 0.6927, + "step": 1175 + }, + { + "epoch": 0.09, + "grad_norm": 1.2213126407984554, + "learning_rate": 1.9804127978285506e-05, + "loss": 0.6612, + "step": 1176 + }, + { + "epoch": 0.09, + "grad_norm": 1.4275996905255162, + "learning_rate": 1.9803632789021553e-05, + "loss": 0.7202, + "step": 1177 + }, + { + "epoch": 0.09, + "grad_norm": 1.2579692482275089, + "learning_rate": 1.9803136980803687e-05, + "loss": 0.6404, + "step": 1178 + }, + { + "epoch": 0.09, + "grad_norm": 1.2835999779483394, + "learning_rate": 1.9802640553663196e-05, + "loss": 0.6448, + "step": 1179 + }, + { + "epoch": 0.09, + "grad_norm": 1.3310670915055065, + "learning_rate": 1.980214350763144e-05, + "loss": 0.696, + "step": 1180 + }, + { + "epoch": 0.09, + "grad_norm": 1.346129904785629, + "learning_rate": 1.9801645842739785e-05, + "loss": 0.7355, + "step": 1181 + }, + { + "epoch": 0.09, + "grad_norm": 1.406778321098541, + "learning_rate": 1.980114755901966e-05, + "loss": 0.7006, + "step": 1182 + }, + { + "epoch": 0.09, + "grad_norm": 1.3086238117465867, + "learning_rate": 1.9800648656502524e-05, + "loss": 0.6399, + "step": 1183 + }, + { + "epoch": 0.09, + "grad_norm": 1.5989505962304749, + "learning_rate": 1.9800149135219873e-05, + "loss": 0.6373, + "step": 1184 + }, + { + "epoch": 0.09, + "grad_norm": 1.4296688224531342, + "learning_rate": 1.9799648995203246e-05, + "loss": 0.6997, + "step": 1185 + }, + { + "epoch": 0.09, + "grad_norm": 1.382837887978123, + "learning_rate": 1.979914823648422e-05, + "loss": 0.663, + "step": 1186 + }, + { + "epoch": 0.09, + "grad_norm": 1.2721115417218372, + "learning_rate": 1.9798646859094405e-05, + "loss": 0.6164, + "step": 1187 + }, + { + "epoch": 0.09, + "grad_norm": 1.3852775283966918, + "learning_rate": 1.979814486306546e-05, + "loss": 0.6401, + "step": 1188 + }, + { + "epoch": 0.09, + "grad_norm": 1.3031675885237042, + "learning_rate": 1.9797642248429078e-05, + "loss": 0.6483, + "step": 1189 + }, + { + "epoch": 0.09, + "grad_norm": 1.3489120161237507, + "learning_rate": 1.979713901521699e-05, + "loss": 0.673, + "step": 1190 + }, + { + "epoch": 0.09, + "grad_norm": 1.2828958221770725, + "learning_rate": 1.9796635163460976e-05, + "loss": 0.6076, + "step": 1191 + }, + { + "epoch": 0.09, + "grad_norm": 1.3223242697395923, + "learning_rate": 1.9796130693192835e-05, + "loss": 0.6755, + "step": 1192 + }, + { + "epoch": 0.09, + "grad_norm": 1.4134175637100281, + "learning_rate": 1.979562560444442e-05, + "loss": 0.6333, + "step": 1193 + }, + { + "epoch": 0.09, + "grad_norm": 1.336011427233703, + "learning_rate": 1.9795119897247627e-05, + "loss": 0.6573, + "step": 1194 + }, + { + "epoch": 0.09, + "grad_norm": 1.4294319966790596, + "learning_rate": 1.9794613571634378e-05, + "loss": 0.6447, + "step": 1195 + }, + { + "epoch": 0.09, + "grad_norm": 1.3986745026150444, + "learning_rate": 1.979410662763664e-05, + "loss": 0.7732, + "step": 1196 + }, + { + "epoch": 0.09, + "grad_norm": 1.4299542954617028, + "learning_rate": 1.9793599065286417e-05, + "loss": 0.6712, + "step": 1197 + }, + { + "epoch": 0.09, + "grad_norm": 1.3742971357811897, + "learning_rate": 1.9793090884615756e-05, + "loss": 0.665, + "step": 1198 + }, + { + "epoch": 0.09, + "grad_norm": 1.287680376250613, + "learning_rate": 1.9792582085656744e-05, + "loss": 0.6295, + "step": 1199 + }, + { + "epoch": 0.09, + "grad_norm": 1.2653865825119586, + "learning_rate": 1.97920726684415e-05, + "loss": 0.6411, + "step": 1200 + }, + { + "epoch": 0.09, + "grad_norm": 1.364137528181507, + "learning_rate": 1.9791562633002194e-05, + "loss": 0.6543, + "step": 1201 + }, + { + "epoch": 0.09, + "grad_norm": 1.4053051635469254, + "learning_rate": 1.9791051979371017e-05, + "loss": 0.7137, + "step": 1202 + }, + { + "epoch": 0.09, + "grad_norm": 1.4748010371873694, + "learning_rate": 1.9790540707580213e-05, + "loss": 0.6858, + "step": 1203 + }, + { + "epoch": 0.09, + "grad_norm": 1.3118999108452385, + "learning_rate": 1.9790028817662065e-05, + "loss": 0.6154, + "step": 1204 + }, + { + "epoch": 0.09, + "grad_norm": 1.3993442089378476, + "learning_rate": 1.978951630964888e-05, + "loss": 0.6687, + "step": 1205 + }, + { + "epoch": 0.09, + "grad_norm": 1.4223187770618666, + "learning_rate": 1.978900318357303e-05, + "loss": 0.6454, + "step": 1206 + }, + { + "epoch": 0.09, + "grad_norm": 1.3651858690892422, + "learning_rate": 1.9788489439466902e-05, + "loss": 0.6481, + "step": 1207 + }, + { + "epoch": 0.09, + "grad_norm": 1.400000226497632, + "learning_rate": 1.9787975077362937e-05, + "loss": 0.6561, + "step": 1208 + }, + { + "epoch": 0.09, + "grad_norm": 1.3360728145132874, + "learning_rate": 1.9787460097293603e-05, + "loss": 0.6034, + "step": 1209 + }, + { + "epoch": 0.09, + "grad_norm": 1.323599926777985, + "learning_rate": 1.9786944499291417e-05, + "loss": 0.6412, + "step": 1210 + }, + { + "epoch": 0.09, + "grad_norm": 1.4149072979326822, + "learning_rate": 1.9786428283388927e-05, + "loss": 0.688, + "step": 1211 + }, + { + "epoch": 0.09, + "grad_norm": 1.414805685928908, + "learning_rate": 1.9785911449618732e-05, + "loss": 0.7068, + "step": 1212 + }, + { + "epoch": 0.09, + "grad_norm": 1.4754153846721507, + "learning_rate": 1.978539399801346e-05, + "loss": 0.6837, + "step": 1213 + }, + { + "epoch": 0.09, + "grad_norm": 1.3753551111292275, + "learning_rate": 1.9784875928605777e-05, + "loss": 0.6223, + "step": 1214 + }, + { + "epoch": 0.09, + "grad_norm": 1.4251710186780597, + "learning_rate": 1.978435724142839e-05, + "loss": 0.7228, + "step": 1215 + }, + { + "epoch": 0.09, + "grad_norm": 1.3832231763082845, + "learning_rate": 1.9783837936514053e-05, + "loss": 0.6726, + "step": 1216 + }, + { + "epoch": 0.09, + "grad_norm": 1.4500479986204673, + "learning_rate": 1.9783318013895552e-05, + "loss": 0.6548, + "step": 1217 + }, + { + "epoch": 0.09, + "grad_norm": 1.3005747588250538, + "learning_rate": 1.9782797473605708e-05, + "loss": 0.6567, + "step": 1218 + }, + { + "epoch": 0.09, + "grad_norm": 1.5355761393007654, + "learning_rate": 1.978227631567738e-05, + "loss": 0.7227, + "step": 1219 + }, + { + "epoch": 0.09, + "grad_norm": 1.4102631842732851, + "learning_rate": 1.9781754540143486e-05, + "loss": 0.742, + "step": 1220 + }, + { + "epoch": 0.09, + "grad_norm": 1.3439314520106866, + "learning_rate": 1.9781232147036958e-05, + "loss": 0.7207, + "step": 1221 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944073790343496, + "learning_rate": 1.978070913639078e-05, + "loss": 0.676, + "step": 1222 + }, + { + "epoch": 0.09, + "grad_norm": 1.3960356731748198, + "learning_rate": 1.9780185508237976e-05, + "loss": 0.706, + "step": 1223 + }, + { + "epoch": 0.09, + "grad_norm": 1.399362907885586, + "learning_rate": 1.9779661262611598e-05, + "loss": 0.6458, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 1.4304060667545093, + "learning_rate": 1.9779136399544747e-05, + "loss": 0.7019, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 1.4063710054682113, + "learning_rate": 1.9778610919070563e-05, + "loss": 0.6423, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 1.3243160155817348, + "learning_rate": 1.977808482122222e-05, + "loss": 0.7011, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 1.4000265800132696, + "learning_rate": 1.9777558106032936e-05, + "loss": 0.6679, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 1.3050823413697417, + "learning_rate": 1.9777030773535962e-05, + "loss": 0.619, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 1.2243187664490291, + "learning_rate": 1.977650282376459e-05, + "loss": 0.5934, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 1.4588698989508349, + "learning_rate": 1.977597425675216e-05, + "loss": 0.6878, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 1.415595556630849, + "learning_rate": 1.9775445072532035e-05, + "loss": 0.6864, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 1.4319039762205037, + "learning_rate": 1.9774915271137625e-05, + "loss": 0.7228, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 1.2616836968149805, + "learning_rate": 1.9774384852602387e-05, + "loss": 0.6162, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 1.4017493813399216, + "learning_rate": 1.97738538169598e-05, + "loss": 0.7778, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 1.4409173896294583, + "learning_rate": 1.9773322164243394e-05, + "loss": 0.6846, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 1.3741735228642569, + "learning_rate": 1.9772789894486743e-05, + "loss": 0.6303, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 1.343105560843702, + "learning_rate": 1.9772257007723442e-05, + "loss": 0.6178, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 1.2393720377291753, + "learning_rate": 1.9771723503987133e-05, + "loss": 0.5986, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 1.3828590083517824, + "learning_rate": 1.9771189383311507e-05, + "loss": 0.7265, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 1.3367191727156558, + "learning_rate": 1.9770654645730286e-05, + "loss": 0.6923, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 1.3043303314880903, + "learning_rate": 1.9770119291277223e-05, + "loss": 0.6321, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 1.3399698626274916, + "learning_rate": 1.9769583319986125e-05, + "loss": 0.6622, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 1.2371714338823765, + "learning_rate": 1.9769046731890825e-05, + "loss": 0.601, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 1.417887199305655, + "learning_rate": 1.9768509527025204e-05, + "loss": 0.6164, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 1.3190168083716396, + "learning_rate": 1.9767971705423176e-05, + "loss": 0.6261, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 1.3498563637085053, + "learning_rate": 1.97674332671187e-05, + "loss": 0.6424, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 1.430250296795552, + "learning_rate": 1.9766894212145767e-05, + "loss": 0.6578, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 1.3691569502832655, + "learning_rate": 1.9766354540538414e-05, + "loss": 0.6788, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 1.339045161487761, + "learning_rate": 1.976581425233071e-05, + "loss": 0.6605, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 1.2840355621802468, + "learning_rate": 1.976527334755677e-05, + "loss": 0.6267, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 1.2673447310319617, + "learning_rate": 1.976473182625074e-05, + "loss": 0.6337, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 1.346546899612652, + "learning_rate": 1.9764189688446803e-05, + "loss": 0.6606, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 1.4501943260728969, + "learning_rate": 1.9763646934179204e-05, + "loss": 0.7229, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 1.2845667730685473, + "learning_rate": 1.97631035634822e-05, + "loss": 0.6559, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 1.397157648152016, + "learning_rate": 1.976255957639009e-05, + "loss": 0.6663, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 1.3866889628045302, + "learning_rate": 1.976201497293723e-05, + "loss": 0.64, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 1.2593359872535244, + "learning_rate": 1.9761469753158e-05, + "loss": 0.6902, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 1.3070455930006084, + "learning_rate": 1.976092391708682e-05, + "loss": 0.7115, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 1.2864158046760956, + "learning_rate": 1.9760377464758154e-05, + "loss": 0.6545, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 1.4093333455969654, + "learning_rate": 1.9759830396206504e-05, + "loss": 0.7259, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 1.2496139884021689, + "learning_rate": 1.9759282711466405e-05, + "loss": 0.6229, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 1.4022444580969724, + "learning_rate": 1.9758734410572435e-05, + "loss": 0.7355, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 1.3869299929195074, + "learning_rate": 1.9758185493559216e-05, + "loss": 0.6824, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 1.442074669818633, + "learning_rate": 1.97576359604614e-05, + "loss": 0.6732, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 1.4624501113859736, + "learning_rate": 1.9757085811313686e-05, + "loss": 0.6482, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 1.2907830517955654, + "learning_rate": 1.97565350461508e-05, + "loss": 0.6272, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 1.3325046358153128, + "learning_rate": 1.9755983665007524e-05, + "loss": 0.6327, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 1.4107812568951001, + "learning_rate": 1.975543166791866e-05, + "loss": 0.6973, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 1.3711360479280659, + "learning_rate": 1.9754879054919067e-05, + "loss": 0.6563, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 1.3825662253523505, + "learning_rate": 1.975432582604363e-05, + "loss": 0.6227, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 1.3599147821121875, + "learning_rate": 1.9753771981327278e-05, + "loss": 0.655, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 1.3522012336867513, + "learning_rate": 1.9753217520804976e-05, + "loss": 0.6367, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 1.3559167215127172, + "learning_rate": 1.9752662444511738e-05, + "loss": 0.6678, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 1.3938523464515515, + "learning_rate": 1.9752106752482596e-05, + "loss": 0.6849, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 1.2887214440660997, + "learning_rate": 1.9751550444752642e-05, + "loss": 0.6558, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 1.506260047974055, + "learning_rate": 1.9750993521356997e-05, + "loss": 0.6281, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 1.2844096510916296, + "learning_rate": 1.9750435982330823e-05, + "loss": 0.6187, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 1.3041757276583863, + "learning_rate": 1.9749877827709323e-05, + "loss": 0.6785, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 1.3923368311810609, + "learning_rate": 1.974931905752773e-05, + "loss": 0.6603, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 1.4022437354845803, + "learning_rate": 1.9748759671821323e-05, + "loss": 0.6251, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 1.2979961742380008, + "learning_rate": 1.9748199670625423e-05, + "loss": 0.6155, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 1.4421464213661455, + "learning_rate": 1.9747639053975386e-05, + "loss": 0.69, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 1.2610511548937215, + "learning_rate": 1.9747077821906602e-05, + "loss": 0.5852, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 1.2882226693533523, + "learning_rate": 1.9746515974454508e-05, + "loss": 0.6923, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 1.409332753497726, + "learning_rate": 1.9745953511654573e-05, + "loss": 0.7094, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 1.2603695390366016, + "learning_rate": 1.974539043354231e-05, + "loss": 0.5916, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 1.4142534327297618, + "learning_rate": 1.9744826740153272e-05, + "loss": 0.7057, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 1.284603707400155, + "learning_rate": 1.9744262431523045e-05, + "loss": 0.6134, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 1.2263677162962707, + "learning_rate": 1.9743697507687253e-05, + "loss": 0.6522, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 1.3702936256004872, + "learning_rate": 1.974313196868157e-05, + "loss": 0.6236, + "step": 1291 + }, + { + "epoch": 0.1, + "grad_norm": 1.3339402734663026, + "learning_rate": 1.97425658145417e-05, + "loss": 0.6803, + "step": 1292 + }, + { + "epoch": 0.1, + "grad_norm": 1.3788075614388753, + "learning_rate": 1.974199904530338e-05, + "loss": 0.6654, + "step": 1293 + }, + { + "epoch": 0.1, + "grad_norm": 1.3536417408326378, + "learning_rate": 1.9741431661002403e-05, + "loss": 0.6845, + "step": 1294 + }, + { + "epoch": 0.1, + "grad_norm": 1.3127354456120703, + "learning_rate": 1.974086366167458e-05, + "loss": 0.6953, + "step": 1295 + }, + { + "epoch": 0.1, + "grad_norm": 1.4802084016990065, + "learning_rate": 1.9740295047355784e-05, + "loss": 0.6945, + "step": 1296 + }, + { + "epoch": 0.1, + "grad_norm": 1.2733851697033904, + "learning_rate": 1.9739725818081905e-05, + "loss": 0.6042, + "step": 1297 + }, + { + "epoch": 0.1, + "grad_norm": 1.3102079541644611, + "learning_rate": 1.9739155973888887e-05, + "loss": 0.6492, + "step": 1298 + }, + { + "epoch": 0.1, + "grad_norm": 1.3442931297411025, + "learning_rate": 1.97385855148127e-05, + "loss": 0.6496, + "step": 1299 + }, + { + "epoch": 0.1, + "grad_norm": 1.322587710754856, + "learning_rate": 1.9738014440889368e-05, + "loss": 0.6683, + "step": 1300 + }, + { + "epoch": 0.1, + "grad_norm": 1.4261772546429026, + "learning_rate": 1.9737442752154944e-05, + "loss": 0.6877, + "step": 1301 + }, + { + "epoch": 0.1, + "grad_norm": 1.4571349897092112, + "learning_rate": 1.973687044864552e-05, + "loss": 0.6411, + "step": 1302 + }, + { + "epoch": 0.1, + "grad_norm": 1.2512433544062485, + "learning_rate": 1.9736297530397227e-05, + "loss": 0.6064, + "step": 1303 + }, + { + "epoch": 0.1, + "grad_norm": 1.3203760165674958, + "learning_rate": 1.973572399744624e-05, + "loss": 0.6353, + "step": 1304 + }, + { + "epoch": 0.1, + "grad_norm": 1.3100388021036906, + "learning_rate": 1.9735149849828767e-05, + "loss": 0.6889, + "step": 1305 + }, + { + "epoch": 0.1, + "grad_norm": 1.5577264729490756, + "learning_rate": 1.973457508758106e-05, + "loss": 0.7482, + "step": 1306 + }, + { + "epoch": 0.1, + "grad_norm": 1.399440512237503, + "learning_rate": 1.9733999710739398e-05, + "loss": 0.6147, + "step": 1307 + }, + { + "epoch": 0.1, + "grad_norm": 1.429925283867604, + "learning_rate": 1.9733423719340118e-05, + "loss": 0.7181, + "step": 1308 + }, + { + "epoch": 0.1, + "grad_norm": 1.2958207038844536, + "learning_rate": 1.973284711341958e-05, + "loss": 0.6567, + "step": 1309 + }, + { + "epoch": 0.1, + "grad_norm": 1.5082136025863881, + "learning_rate": 1.9732269893014188e-05, + "loss": 0.7189, + "step": 1310 + }, + { + "epoch": 0.1, + "grad_norm": 1.2778157053492332, + "learning_rate": 1.9731692058160387e-05, + "loss": 0.6148, + "step": 1311 + }, + { + "epoch": 0.1, + "grad_norm": 1.3436301865501994, + "learning_rate": 1.9731113608894656e-05, + "loss": 0.6477, + "step": 1312 + }, + { + "epoch": 0.1, + "grad_norm": 1.4867034317967422, + "learning_rate": 1.9730534545253516e-05, + "loss": 0.6589, + "step": 1313 + }, + { + "epoch": 0.1, + "grad_norm": 1.5054852012127375, + "learning_rate": 1.9729954867273528e-05, + "loss": 0.7017, + "step": 1314 + }, + { + "epoch": 0.1, + "grad_norm": 1.334658763650278, + "learning_rate": 1.9729374574991288e-05, + "loss": 0.6586, + "step": 1315 + }, + { + "epoch": 0.1, + "grad_norm": 1.4171479006179815, + "learning_rate": 1.9728793668443437e-05, + "loss": 0.6929, + "step": 1316 + }, + { + "epoch": 0.1, + "grad_norm": 1.3340001028538426, + "learning_rate": 1.9728212147666647e-05, + "loss": 0.6363, + "step": 1317 + }, + { + "epoch": 0.1, + "grad_norm": 1.4973064716598479, + "learning_rate": 1.9727630012697633e-05, + "loss": 0.6183, + "step": 1318 + }, + { + "epoch": 0.1, + "grad_norm": 1.3671159562056532, + "learning_rate": 1.9727047263573148e-05, + "loss": 0.6643, + "step": 1319 + }, + { + "epoch": 0.1, + "grad_norm": 1.3051263219707756, + "learning_rate": 1.9726463900329985e-05, + "loss": 0.6885, + "step": 1320 + }, + { + "epoch": 0.1, + "grad_norm": 1.3832130498565052, + "learning_rate": 1.972587992300497e-05, + "loss": 0.6614, + "step": 1321 + }, + { + "epoch": 0.1, + "grad_norm": 1.427180191799803, + "learning_rate": 1.972529533163498e-05, + "loss": 0.6415, + "step": 1322 + }, + { + "epoch": 0.1, + "grad_norm": 1.402858374056527, + "learning_rate": 1.972471012625692e-05, + "loss": 0.6604, + "step": 1323 + }, + { + "epoch": 0.1, + "grad_norm": 1.3144097285596168, + "learning_rate": 1.9724124306907737e-05, + "loss": 0.7101, + "step": 1324 + }, + { + "epoch": 0.1, + "grad_norm": 1.4888946790883095, + "learning_rate": 1.9723537873624418e-05, + "loss": 0.7099, + "step": 1325 + }, + { + "epoch": 0.1, + "grad_norm": 1.3923004858063108, + "learning_rate": 1.9722950826443983e-05, + "loss": 0.635, + "step": 1326 + }, + { + "epoch": 0.1, + "grad_norm": 1.3574628864926497, + "learning_rate": 1.97223631654035e-05, + "loss": 0.6596, + "step": 1327 + }, + { + "epoch": 0.1, + "grad_norm": 1.3104516348648454, + "learning_rate": 1.972177489054007e-05, + "loss": 0.6785, + "step": 1328 + }, + { + "epoch": 0.1, + "grad_norm": 1.494565576696541, + "learning_rate": 1.972118600189083e-05, + "loss": 0.7091, + "step": 1329 + }, + { + "epoch": 0.1, + "grad_norm": 1.2904097943384203, + "learning_rate": 1.9720596499492965e-05, + "loss": 0.6412, + "step": 1330 + }, + { + "epoch": 0.1, + "grad_norm": 1.2696502160304957, + "learning_rate": 1.9720006383383694e-05, + "loss": 0.6497, + "step": 1331 + }, + { + "epoch": 0.1, + "grad_norm": 1.3764578286946552, + "learning_rate": 1.971941565360027e-05, + "loss": 0.6016, + "step": 1332 + }, + { + "epoch": 0.1, + "grad_norm": 1.3056387716966713, + "learning_rate": 1.971882431017999e-05, + "loss": 0.6291, + "step": 1333 + }, + { + "epoch": 0.1, + "grad_norm": 1.3281560108827877, + "learning_rate": 1.9718232353160187e-05, + "loss": 0.6634, + "step": 1334 + }, + { + "epoch": 0.1, + "grad_norm": 1.3282381346144063, + "learning_rate": 1.971763978257824e-05, + "loss": 0.5961, + "step": 1335 + }, + { + "epoch": 0.1, + "grad_norm": 1.3181828133361073, + "learning_rate": 1.9717046598471553e-05, + "loss": 0.6113, + "step": 1336 + }, + { + "epoch": 0.1, + "grad_norm": 1.2592238096444863, + "learning_rate": 1.9716452800877582e-05, + "loss": 0.6382, + "step": 1337 + }, + { + "epoch": 0.1, + "grad_norm": 1.3874714272796158, + "learning_rate": 1.9715858389833816e-05, + "loss": 0.6665, + "step": 1338 + }, + { + "epoch": 0.1, + "grad_norm": 1.4367131277184888, + "learning_rate": 1.9715263365377782e-05, + "loss": 0.6465, + "step": 1339 + }, + { + "epoch": 0.1, + "grad_norm": 1.3441409052543936, + "learning_rate": 1.9714667727547047e-05, + "loss": 0.6306, + "step": 1340 + }, + { + "epoch": 0.1, + "grad_norm": 1.2598321468325648, + "learning_rate": 1.9714071476379216e-05, + "loss": 0.6457, + "step": 1341 + }, + { + "epoch": 0.1, + "grad_norm": 1.2656911667847015, + "learning_rate": 1.971347461191194e-05, + "loss": 0.6455, + "step": 1342 + }, + { + "epoch": 0.1, + "grad_norm": 1.393301114039878, + "learning_rate": 1.9712877134182893e-05, + "loss": 0.6345, + "step": 1343 + }, + { + "epoch": 0.1, + "grad_norm": 1.2886923518639681, + "learning_rate": 1.97122790432298e-05, + "loss": 0.6618, + "step": 1344 + }, + { + "epoch": 0.1, + "grad_norm": 1.3950811418568103, + "learning_rate": 1.9711680339090425e-05, + "loss": 0.7288, + "step": 1345 + }, + { + "epoch": 0.1, + "grad_norm": 1.4447502073441947, + "learning_rate": 1.9711081021802563e-05, + "loss": 0.7497, + "step": 1346 + }, + { + "epoch": 0.1, + "grad_norm": 1.2850127444395998, + "learning_rate": 1.9710481091404056e-05, + "loss": 0.6443, + "step": 1347 + }, + { + "epoch": 0.1, + "grad_norm": 1.4118943004462583, + "learning_rate": 1.9709880547932774e-05, + "loss": 0.6737, + "step": 1348 + }, + { + "epoch": 0.1, + "grad_norm": 1.3070703549643141, + "learning_rate": 1.9709279391426638e-05, + "loss": 0.649, + "step": 1349 + }, + { + "epoch": 0.1, + "grad_norm": 1.2349974113313673, + "learning_rate": 1.97086776219236e-05, + "loss": 0.6706, + "step": 1350 + }, + { + "epoch": 0.1, + "grad_norm": 1.3275052139122363, + "learning_rate": 1.9708075239461656e-05, + "loss": 0.5965, + "step": 1351 + }, + { + "epoch": 0.1, + "grad_norm": 1.1952545550893299, + "learning_rate": 1.9707472244078833e-05, + "loss": 0.6654, + "step": 1352 + }, + { + "epoch": 0.1, + "grad_norm": 1.2581092058034615, + "learning_rate": 1.9706868635813205e-05, + "loss": 0.6484, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 1.3635989368244346, + "learning_rate": 1.970626441470288e-05, + "loss": 0.7099, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 1.352673773822253, + "learning_rate": 1.9705659580785997e-05, + "loss": 0.7159, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 1.4107267963827326, + "learning_rate": 1.9705054134100758e-05, + "loss": 0.65, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 1.417609891853523, + "learning_rate": 1.9704448074685377e-05, + "loss": 0.6832, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 1.2732380435815898, + "learning_rate": 1.9703841402578122e-05, + "loss": 0.6452, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 1.281590346745653, + "learning_rate": 1.9703234117817293e-05, + "loss": 0.628, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 1.3332260903303768, + "learning_rate": 1.9702626220441234e-05, + "loss": 0.5842, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4164228603527387, + "learning_rate": 1.970201771048832e-05, + "loss": 0.6811, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 1.337874987434274, + "learning_rate": 1.9701408587996976e-05, + "loss": 0.6857, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 1.3376260983215107, + "learning_rate": 1.9700798853005652e-05, + "loss": 0.6693, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 1.46019224037812, + "learning_rate": 1.9700188505552847e-05, + "loss": 0.6573, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 1.3934939076482433, + "learning_rate": 1.9699577545677097e-05, + "loss": 0.6704, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 1.3225548116650585, + "learning_rate": 1.969896597341697e-05, + "loss": 0.6682, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 1.3609855144102638, + "learning_rate": 1.9698353788811083e-05, + "loss": 0.666, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 1.1670668063388954, + "learning_rate": 1.9697740991898087e-05, + "loss": 0.6495, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 1.3525111667221432, + "learning_rate": 1.9697127582716664e-05, + "loss": 0.6399, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 1.3967121925460386, + "learning_rate": 1.9696513561305548e-05, + "loss": 0.7391, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 1.2661433159340119, + "learning_rate": 1.96958989277035e-05, + "loss": 0.6653, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 1.4694807791268958, + "learning_rate": 1.9695283681949337e-05, + "loss": 0.6338, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 1.3232123677740102, + "learning_rate": 1.9694667824081888e-05, + "loss": 0.6726, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 1.4554458460743576, + "learning_rate": 1.969405135414004e-05, + "loss": 0.6913, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 1.3616768213887198, + "learning_rate": 1.969343427216272e-05, + "loss": 0.6915, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 1.1213424353628918, + "learning_rate": 1.969281657818888e-05, + "loss": 0.579, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 1.3023375402896349, + "learning_rate": 1.9692198272257523e-05, + "loss": 0.6318, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 1.2886569685708371, + "learning_rate": 1.969157935440768e-05, + "loss": 0.6205, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 1.2081973668037393, + "learning_rate": 1.9690959824678436e-05, + "loss": 0.6209, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 1.4683630413264368, + "learning_rate": 1.9690339683108894e-05, + "loss": 0.6566, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 1.3190817881106844, + "learning_rate": 1.9689718929738215e-05, + "loss": 0.6452, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 1.4568964100642856, + "learning_rate": 1.968909756460559e-05, + "loss": 0.618, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 1.3458943224257083, + "learning_rate": 1.968847558775024e-05, + "loss": 0.6899, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 1.3247938229636835, + "learning_rate": 1.9687852999211446e-05, + "loss": 0.6129, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 1.2653562001308303, + "learning_rate": 1.9687229799028506e-05, + "loss": 0.6127, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 1.3502354805046861, + "learning_rate": 1.968660598724077e-05, + "loss": 0.5831, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 1.3013737334422124, + "learning_rate": 1.9685981563887623e-05, + "loss": 0.6494, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 1.2750670882881496, + "learning_rate": 1.9685356529008487e-05, + "loss": 0.6792, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 1.363217109111308, + "learning_rate": 1.9684730882642822e-05, + "loss": 0.6512, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 1.4343266497672076, + "learning_rate": 1.9684104624830128e-05, + "loss": 0.6691, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 1.427636181002575, + "learning_rate": 1.9683477755609946e-05, + "loss": 0.6198, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 1.3264707248245353, + "learning_rate": 1.9682850275021858e-05, + "loss": 0.6036, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 1.327145114021547, + "learning_rate": 1.968222218310547e-05, + "loss": 0.6261, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 1.4332285735563004, + "learning_rate": 1.9681593479900444e-05, + "loss": 0.6474, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 1.3317978282465646, + "learning_rate": 1.9680964165446475e-05, + "loss": 0.6205, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 1.3282284864610847, + "learning_rate": 1.9680334239783286e-05, + "loss": 0.6482, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 1.2872462105842508, + "learning_rate": 1.9679703702950653e-05, + "loss": 0.5826, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 1.317884074115748, + "learning_rate": 1.9679072554988387e-05, + "loss": 0.6478, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 1.477304100823874, + "learning_rate": 1.9678440795936332e-05, + "loss": 0.727, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 1.2830047686362602, + "learning_rate": 1.9677808425834374e-05, + "loss": 0.6241, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 1.4408850412410226, + "learning_rate": 1.967717544472244e-05, + "loss": 0.7432, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 1.374670162520849, + "learning_rate": 1.967654185264049e-05, + "loss": 0.7134, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 1.4369771255233479, + "learning_rate": 1.9675907649628532e-05, + "loss": 0.6893, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 1.370569938740028, + "learning_rate": 1.9675272835726604e-05, + "loss": 0.6755, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 1.3160468270194257, + "learning_rate": 1.967463741097478e-05, + "loss": 0.6197, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 1.2821117852374126, + "learning_rate": 1.9674001375413182e-05, + "loss": 0.6588, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 1.3724511972029891, + "learning_rate": 1.9673364729081968e-05, + "loss": 0.6658, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 1.3909350703159726, + "learning_rate": 1.967272747202133e-05, + "loss": 0.6973, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 1.4262784744081567, + "learning_rate": 1.9672089604271503e-05, + "loss": 0.6834, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 1.402619358702424, + "learning_rate": 1.9671451125872757e-05, + "loss": 0.6341, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 1.2839877489194766, + "learning_rate": 1.9670812036865402e-05, + "loss": 0.5866, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 1.3311213184900068, + "learning_rate": 1.967017233728979e-05, + "loss": 0.7219, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 1.291405092436173, + "learning_rate": 1.9669532027186308e-05, + "loss": 0.666, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 1.3878211998532612, + "learning_rate": 1.966889110659538e-05, + "loss": 0.6766, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 1.3578105123173922, + "learning_rate": 1.9668249575557468e-05, + "loss": 0.6407, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 1.5231094398376817, + "learning_rate": 1.966760743411308e-05, + "loss": 0.701, + "step": 1416 + }, + { + "epoch": 0.11, + "grad_norm": 1.33094488238159, + "learning_rate": 1.9666964682302757e-05, + "loss": 0.6604, + "step": 1417 + }, + { + "epoch": 0.11, + "grad_norm": 1.4229809159629738, + "learning_rate": 1.9666321320167083e-05, + "loss": 0.6916, + "step": 1418 + }, + { + "epoch": 0.11, + "grad_norm": 1.353501312928819, + "learning_rate": 1.9665677347746667e-05, + "loss": 0.6485, + "step": 1419 + }, + { + "epoch": 0.11, + "grad_norm": 1.4050817511317932, + "learning_rate": 1.9665032765082174e-05, + "loss": 0.6948, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 1.3017059338728252, + "learning_rate": 1.96643875722143e-05, + "loss": 0.6522, + "step": 1421 + }, + { + "epoch": 0.11, + "grad_norm": 1.4904558609624012, + "learning_rate": 1.966374176918377e-05, + "loss": 0.666, + "step": 1422 + }, + { + "epoch": 0.11, + "grad_norm": 1.2718702630182106, + "learning_rate": 1.966309535603137e-05, + "loss": 0.6637, + "step": 1423 + }, + { + "epoch": 0.11, + "grad_norm": 1.3190037035857545, + "learning_rate": 1.9662448332797905e-05, + "loss": 0.6748, + "step": 1424 + }, + { + "epoch": 0.11, + "grad_norm": 1.2997543818184787, + "learning_rate": 1.9661800699524218e-05, + "loss": 0.6677, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 1.263777243512033, + "learning_rate": 1.966115245625121e-05, + "loss": 0.6109, + "step": 1426 + }, + { + "epoch": 0.11, + "grad_norm": 1.397406810935863, + "learning_rate": 1.96605036030198e-05, + "loss": 0.6861, + "step": 1427 + }, + { + "epoch": 0.11, + "grad_norm": 1.34304680278459, + "learning_rate": 1.9659854139870963e-05, + "loss": 0.6652, + "step": 1428 + }, + { + "epoch": 0.11, + "grad_norm": 1.3773783575027967, + "learning_rate": 1.965920406684569e-05, + "loss": 0.6398, + "step": 1429 + }, + { + "epoch": 0.11, + "grad_norm": 1.2313327722756415, + "learning_rate": 1.965855338398503e-05, + "loss": 0.5613, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 1.2920659996394597, + "learning_rate": 1.965790209133006e-05, + "loss": 0.5961, + "step": 1431 + }, + { + "epoch": 0.11, + "grad_norm": 1.3497886704234359, + "learning_rate": 1.9657250188921906e-05, + "loss": 0.6261, + "step": 1432 + }, + { + "epoch": 0.11, + "grad_norm": 1.2567181772877916, + "learning_rate": 1.9656597676801726e-05, + "loss": 0.5708, + "step": 1433 + }, + { + "epoch": 0.11, + "grad_norm": 1.28523026922812, + "learning_rate": 1.9655944555010705e-05, + "loss": 0.6334, + "step": 1434 + }, + { + "epoch": 0.11, + "grad_norm": 1.3842284717039726, + "learning_rate": 1.9655290823590095e-05, + "loss": 0.6412, + "step": 1435 + }, + { + "epoch": 0.11, + "grad_norm": 1.5027082154882883, + "learning_rate": 1.9654636482581157e-05, + "loss": 0.6968, + "step": 1436 + }, + { + "epoch": 0.11, + "grad_norm": 1.2456872450401624, + "learning_rate": 1.9653981532025206e-05, + "loss": 0.6489, + "step": 1437 + }, + { + "epoch": 0.11, + "grad_norm": 1.4398361793687466, + "learning_rate": 1.9653325971963594e-05, + "loss": 0.6881, + "step": 1438 + }, + { + "epoch": 0.11, + "grad_norm": 1.363192448825361, + "learning_rate": 1.9652669802437704e-05, + "loss": 0.6535, + "step": 1439 + }, + { + "epoch": 0.11, + "grad_norm": 1.5079080818201034, + "learning_rate": 1.9652013023488972e-05, + "loss": 0.7191, + "step": 1440 + }, + { + "epoch": 0.11, + "grad_norm": 1.3701218423472024, + "learning_rate": 1.9651355635158863e-05, + "loss": 0.6231, + "step": 1441 + }, + { + "epoch": 0.11, + "grad_norm": 1.4454980679541667, + "learning_rate": 1.965069763748887e-05, + "loss": 0.7062, + "step": 1442 + }, + { + "epoch": 0.11, + "grad_norm": 1.4891703995862617, + "learning_rate": 1.9650039030520552e-05, + "loss": 0.6624, + "step": 1443 + }, + { + "epoch": 0.11, + "grad_norm": 1.3884134172801812, + "learning_rate": 1.964937981429548e-05, + "loss": 0.6836, + "step": 1444 + }, + { + "epoch": 0.11, + "grad_norm": 1.370083820048184, + "learning_rate": 1.9648719988855278e-05, + "loss": 0.6647, + "step": 1445 + }, + { + "epoch": 0.11, + "grad_norm": 1.4064895002079651, + "learning_rate": 1.9648059554241596e-05, + "loss": 0.6579, + "step": 1446 + }, + { + "epoch": 0.11, + "grad_norm": 1.3813042470368648, + "learning_rate": 1.9647398510496144e-05, + "loss": 0.6635, + "step": 1447 + }, + { + "epoch": 0.11, + "grad_norm": 1.5169813241934325, + "learning_rate": 1.9646736857660644e-05, + "loss": 0.6605, + "step": 1448 + }, + { + "epoch": 0.11, + "grad_norm": 1.326361663676536, + "learning_rate": 1.964607459577688e-05, + "loss": 0.6462, + "step": 1449 + }, + { + "epoch": 0.11, + "grad_norm": 1.4212967252138153, + "learning_rate": 1.964541172488666e-05, + "loss": 0.6866, + "step": 1450 + }, + { + "epoch": 0.11, + "grad_norm": 1.4149328261697203, + "learning_rate": 1.964474824503183e-05, + "loss": 0.6247, + "step": 1451 + }, + { + "epoch": 0.11, + "grad_norm": 1.3792144171678165, + "learning_rate": 1.9644084156254285e-05, + "loss": 0.5731, + "step": 1452 + }, + { + "epoch": 0.11, + "grad_norm": 1.379319774661222, + "learning_rate": 1.9643419458595948e-05, + "loss": 0.6721, + "step": 1453 + }, + { + "epoch": 0.11, + "grad_norm": 1.2561116061677613, + "learning_rate": 1.964275415209879e-05, + "loss": 0.6117, + "step": 1454 + }, + { + "epoch": 0.11, + "grad_norm": 1.3790784085356942, + "learning_rate": 1.964208823680481e-05, + "loss": 0.6504, + "step": 1455 + }, + { + "epoch": 0.11, + "grad_norm": 1.297836544733584, + "learning_rate": 1.9641421712756055e-05, + "loss": 0.6207, + "step": 1456 + }, + { + "epoch": 0.11, + "grad_norm": 1.2937489716898936, + "learning_rate": 1.9640754579994607e-05, + "loss": 0.6751, + "step": 1457 + }, + { + "epoch": 0.11, + "grad_norm": 1.2338673960324187, + "learning_rate": 1.9640086838562577e-05, + "loss": 0.6105, + "step": 1458 + }, + { + "epoch": 0.11, + "grad_norm": 1.3032162074089928, + "learning_rate": 1.9639418488502132e-05, + "loss": 0.6655, + "step": 1459 + }, + { + "epoch": 0.11, + "grad_norm": 2.5097029266244295, + "learning_rate": 1.9638749529855463e-05, + "loss": 0.672, + "step": 1460 + }, + { + "epoch": 0.11, + "grad_norm": 1.33194649614095, + "learning_rate": 1.9638079962664807e-05, + "loss": 0.6004, + "step": 1461 + }, + { + "epoch": 0.11, + "grad_norm": 1.4529644456991018, + "learning_rate": 1.963740978697244e-05, + "loss": 0.6921, + "step": 1462 + }, + { + "epoch": 0.11, + "grad_norm": 1.3697615887297365, + "learning_rate": 1.9636739002820665e-05, + "loss": 0.6528, + "step": 1463 + }, + { + "epoch": 0.11, + "grad_norm": 1.4559886157513002, + "learning_rate": 1.9636067610251842e-05, + "loss": 0.7375, + "step": 1464 + }, + { + "epoch": 0.11, + "grad_norm": 1.4152314639081294, + "learning_rate": 1.9635395609308354e-05, + "loss": 0.7021, + "step": 1465 + }, + { + "epoch": 0.11, + "grad_norm": 1.3959513894255413, + "learning_rate": 1.963472300003263e-05, + "loss": 0.6407, + "step": 1466 + }, + { + "epoch": 0.11, + "grad_norm": 1.3531217456100126, + "learning_rate": 1.9634049782467132e-05, + "loss": 0.6784, + "step": 1467 + }, + { + "epoch": 0.11, + "grad_norm": 1.3059475233684217, + "learning_rate": 1.9633375956654368e-05, + "loss": 0.6873, + "step": 1468 + }, + { + "epoch": 0.11, + "grad_norm": 1.3286261566671012, + "learning_rate": 1.9632701522636878e-05, + "loss": 0.6732, + "step": 1469 + }, + { + "epoch": 0.11, + "grad_norm": 1.3085873276282465, + "learning_rate": 1.9632026480457243e-05, + "loss": 0.6367, + "step": 1470 + }, + { + "epoch": 0.11, + "grad_norm": 1.2821963002337313, + "learning_rate": 1.963135083015808e-05, + "loss": 0.6455, + "step": 1471 + }, + { + "epoch": 0.11, + "grad_norm": 1.4285298647964901, + "learning_rate": 1.963067457178205e-05, + "loss": 0.7056, + "step": 1472 + }, + { + "epoch": 0.11, + "grad_norm": 1.3905576625336742, + "learning_rate": 1.9629997705371845e-05, + "loss": 0.5912, + "step": 1473 + }, + { + "epoch": 0.11, + "grad_norm": 1.318258052747074, + "learning_rate": 1.96293202309702e-05, + "loss": 0.7171, + "step": 1474 + }, + { + "epoch": 0.11, + "grad_norm": 1.41699597327307, + "learning_rate": 1.962864214861989e-05, + "loss": 0.6417, + "step": 1475 + }, + { + "epoch": 0.11, + "grad_norm": 1.2461805163941169, + "learning_rate": 1.9627963458363722e-05, + "loss": 0.5996, + "step": 1476 + }, + { + "epoch": 0.11, + "grad_norm": 1.3295378855213416, + "learning_rate": 1.962728416024455e-05, + "loss": 0.6438, + "step": 1477 + }, + { + "epoch": 0.11, + "grad_norm": 1.3806184507243149, + "learning_rate": 1.9626604254305258e-05, + "loss": 0.6045, + "step": 1478 + }, + { + "epoch": 0.11, + "grad_norm": 1.4109708171641018, + "learning_rate": 1.9625923740588767e-05, + "loss": 0.6211, + "step": 1479 + }, + { + "epoch": 0.11, + "grad_norm": 1.4741973503431585, + "learning_rate": 1.9625242619138053e-05, + "loss": 0.7101, + "step": 1480 + }, + { + "epoch": 0.11, + "grad_norm": 1.5061128356021916, + "learning_rate": 1.962456088999611e-05, + "loss": 0.642, + "step": 1481 + }, + { + "epoch": 0.11, + "grad_norm": 1.3892191520727235, + "learning_rate": 1.9623878553205986e-05, + "loss": 0.6583, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 1.2544597224204415, + "learning_rate": 1.9623195608810753e-05, + "loss": 0.6542, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 1.4103987213225255, + "learning_rate": 1.9622512056853533e-05, + "loss": 0.6287, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 1.2931652092708912, + "learning_rate": 1.962182789737748e-05, + "loss": 0.6521, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 1.405806535097205, + "learning_rate": 1.962114313042579e-05, + "loss": 0.7069, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 1.3555031807783708, + "learning_rate": 1.9620457756041695e-05, + "loss": 0.6509, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 1.2078097897359783, + "learning_rate": 1.961977177426847e-05, + "loss": 0.6376, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 1.3442926420117547, + "learning_rate": 1.9619085185149417e-05, + "loss": 0.6412, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 1.514891614152846, + "learning_rate": 1.9618397988727892e-05, + "loss": 0.7223, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 1.2858533736715312, + "learning_rate": 1.9617710185047274e-05, + "loss": 0.66, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 1.4054810859007343, + "learning_rate": 1.9617021774150996e-05, + "loss": 0.7262, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 1.4371497515361626, + "learning_rate": 1.961633275608251e-05, + "loss": 0.7114, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 1.2273913456206889, + "learning_rate": 1.9615643130885327e-05, + "loss": 0.6181, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 1.267532089173306, + "learning_rate": 1.9614952898602983e-05, + "loss": 0.6302, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 1.3370799929271104, + "learning_rate": 1.9614262059279054e-05, + "loss": 0.6239, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 1.4962116722864225, + "learning_rate": 1.9613570612957164e-05, + "loss": 0.686, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 1.2704701399733982, + "learning_rate": 1.9612878559680953e-05, + "loss": 0.6545, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 1.1557092690986264, + "learning_rate": 1.9612185899494127e-05, + "loss": 0.6234, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 1.4162674883766881, + "learning_rate": 1.9611492632440415e-05, + "loss": 0.6503, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 1.4207065518253712, + "learning_rate": 1.961079875856358e-05, + "loss": 0.69, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 1.2116551395446886, + "learning_rate": 1.961010427790744e-05, + "loss": 0.6274, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 1.3271790670042098, + "learning_rate": 1.960940919051583e-05, + "loss": 0.6931, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 1.4489016121632021, + "learning_rate": 1.9608713496432637e-05, + "loss": 0.672, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 1.43767007568, + "learning_rate": 1.9608017195701795e-05, + "loss": 0.6606, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 1.3455544709371796, + "learning_rate": 1.960732028836725e-05, + "loss": 0.6673, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 1.3500249418850947, + "learning_rate": 1.960662277447301e-05, + "loss": 0.6585, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 1.4347901717905132, + "learning_rate": 1.960592465406311e-05, + "loss": 0.7402, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 1.393893953766652, + "learning_rate": 1.960522592718163e-05, + "loss": 0.6408, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 1.0830572889221242, + "learning_rate": 1.9604526593872678e-05, + "loss": 0.5447, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 1.2706019654745004, + "learning_rate": 1.960382665418041e-05, + "loss": 0.6652, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 1.2725235169041942, + "learning_rate": 1.9603126108149014e-05, + "loss": 0.6438, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 1.2505317987260216, + "learning_rate": 1.9602424955822725e-05, + "loss": 0.5812, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 1.2459194814863555, + "learning_rate": 1.9601723197245803e-05, + "loss": 0.6672, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 1.3984968716256014, + "learning_rate": 1.960102083246256e-05, + "loss": 0.7122, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 1.450813505916241, + "learning_rate": 1.9600317861517335e-05, + "loss": 0.6872, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 1.3646439351998105, + "learning_rate": 1.959961428445451e-05, + "loss": 0.6125, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 1.2954163678793544, + "learning_rate": 1.9598910101318514e-05, + "loss": 0.5886, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 1.25640629417667, + "learning_rate": 1.9598205312153795e-05, + "loss": 0.6207, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 1.4745581207462595, + "learning_rate": 1.9597499917004856e-05, + "loss": 0.7358, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 1.319866172696702, + "learning_rate": 1.9596793915916227e-05, + "loss": 0.6598, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 1.3102367051135526, + "learning_rate": 1.959608730893249e-05, + "loss": 0.6806, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 1.2986111796759294, + "learning_rate": 1.959538009609825e-05, + "loss": 0.6256, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 1.312147865287336, + "learning_rate": 1.959467227745816e-05, + "loss": 0.6063, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 1.2882306738480376, + "learning_rate": 1.9593963853056905e-05, + "loss": 0.6839, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 1.4693887011734195, + "learning_rate": 1.9593254822939215e-05, + "loss": 0.6933, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 1.400339846574915, + "learning_rate": 1.9592545187149855e-05, + "loss": 0.6879, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 1.3867996836940029, + "learning_rate": 1.9591834945733625e-05, + "loss": 0.6391, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 1.4110075264722555, + "learning_rate": 1.9591124098735363e-05, + "loss": 0.6457, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 1.3716469977132435, + "learning_rate": 1.959041264619996e-05, + "loss": 0.6833, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 1.3963892337427741, + "learning_rate": 1.958970058817232e-05, + "loss": 0.639, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 1.5121028126421716, + "learning_rate": 1.958898792469741e-05, + "loss": 0.7052, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 1.5256500609643444, + "learning_rate": 1.958827465582022e-05, + "loss": 0.7058, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 1.4835974815285566, + "learning_rate": 1.958756078158578e-05, + "loss": 0.7067, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 1.349144111517021, + "learning_rate": 1.958684630203916e-05, + "loss": 0.624, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 1.4271914680146813, + "learning_rate": 1.958613121722548e-05, + "loss": 0.663, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 1.3336491757067224, + "learning_rate": 1.958541552718987e-05, + "loss": 0.6791, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 1.5672743525071198, + "learning_rate": 1.9584699231977526e-05, + "loss": 0.633, + "step": 1538 + }, + { + "epoch": 0.12, + "grad_norm": 1.2150944392859622, + "learning_rate": 1.9583982331633668e-05, + "loss": 0.5963, + "step": 1539 + }, + { + "epoch": 0.12, + "grad_norm": 1.2103470377939822, + "learning_rate": 1.9583264826203564e-05, + "loss": 0.5991, + "step": 1540 + }, + { + "epoch": 0.12, + "grad_norm": 1.49303671753505, + "learning_rate": 1.9582546715732503e-05, + "loss": 0.6851, + "step": 1541 + }, + { + "epoch": 0.12, + "grad_norm": 1.1446403477460383, + "learning_rate": 1.958182800026583e-05, + "loss": 0.576, + "step": 1542 + }, + { + "epoch": 0.12, + "grad_norm": 1.3180844620774201, + "learning_rate": 1.9581108679848918e-05, + "loss": 0.623, + "step": 1543 + }, + { + "epoch": 0.12, + "grad_norm": 1.2114147107357678, + "learning_rate": 1.9580388754527185e-05, + "loss": 0.598, + "step": 1544 + }, + { + "epoch": 0.12, + "grad_norm": 1.1869258998665362, + "learning_rate": 1.957966822434608e-05, + "loss": 0.6084, + "step": 1545 + }, + { + "epoch": 0.12, + "grad_norm": 1.2582476319355758, + "learning_rate": 1.9578947089351095e-05, + "loss": 0.6042, + "step": 1546 + }, + { + "epoch": 0.12, + "grad_norm": 1.2811762160478832, + "learning_rate": 1.9578225349587763e-05, + "loss": 0.5874, + "step": 1547 + }, + { + "epoch": 0.12, + "grad_norm": 1.3432276398008005, + "learning_rate": 1.9577503005101644e-05, + "loss": 0.6726, + "step": 1548 + }, + { + "epoch": 0.12, + "grad_norm": 1.3882739735049308, + "learning_rate": 1.9576780055938348e-05, + "loss": 0.6434, + "step": 1549 + }, + { + "epoch": 0.12, + "grad_norm": 1.3250526345792901, + "learning_rate": 1.957605650214352e-05, + "loss": 0.692, + "step": 1550 + }, + { + "epoch": 0.12, + "grad_norm": 1.3034620183130825, + "learning_rate": 1.9575332343762832e-05, + "loss": 0.6672, + "step": 1551 + }, + { + "epoch": 0.12, + "grad_norm": 1.2811228060972017, + "learning_rate": 1.9574607580842018e-05, + "loss": 0.6272, + "step": 1552 + }, + { + "epoch": 0.12, + "grad_norm": 1.3208527672879262, + "learning_rate": 1.9573882213426824e-05, + "loss": 0.5954, + "step": 1553 + }, + { + "epoch": 0.12, + "grad_norm": 1.2306002592018632, + "learning_rate": 1.9573156241563053e-05, + "loss": 0.6036, + "step": 1554 + }, + { + "epoch": 0.12, + "grad_norm": 1.2603278273282519, + "learning_rate": 1.957242966529654e-05, + "loss": 0.6039, + "step": 1555 + }, + { + "epoch": 0.12, + "grad_norm": 1.2573860819884088, + "learning_rate": 1.9571702484673153e-05, + "loss": 0.6227, + "step": 1556 + }, + { + "epoch": 0.12, + "grad_norm": 1.2413642602520683, + "learning_rate": 1.9570974699738805e-05, + "loss": 0.5891, + "step": 1557 + }, + { + "epoch": 0.12, + "grad_norm": 1.3526493179057293, + "learning_rate": 1.9570246310539443e-05, + "loss": 0.6823, + "step": 1558 + }, + { + "epoch": 0.12, + "grad_norm": 1.4106626578777612, + "learning_rate": 1.9569517317121058e-05, + "loss": 0.6279, + "step": 1559 + }, + { + "epoch": 0.12, + "grad_norm": 1.233501656533108, + "learning_rate": 1.9568787719529673e-05, + "loss": 0.6215, + "step": 1560 + }, + { + "epoch": 0.12, + "grad_norm": 1.4274298347024477, + "learning_rate": 1.9568057517811345e-05, + "loss": 0.674, + "step": 1561 + }, + { + "epoch": 0.12, + "grad_norm": 1.2834856485828359, + "learning_rate": 1.9567326712012188e-05, + "loss": 0.6489, + "step": 1562 + }, + { + "epoch": 0.12, + "grad_norm": 1.3149857370161504, + "learning_rate": 1.956659530217833e-05, + "loss": 0.6282, + "step": 1563 + }, + { + "epoch": 0.12, + "grad_norm": 1.3475865000490472, + "learning_rate": 1.9565863288355955e-05, + "loss": 0.6411, + "step": 1564 + }, + { + "epoch": 0.12, + "grad_norm": 1.2486982242821192, + "learning_rate": 1.956513067059128e-05, + "loss": 0.647, + "step": 1565 + }, + { + "epoch": 0.12, + "grad_norm": 1.347892629928924, + "learning_rate": 1.9564397448930552e-05, + "loss": 0.7104, + "step": 1566 + }, + { + "epoch": 0.12, + "grad_norm": 1.2879110568979681, + "learning_rate": 1.9563663623420067e-05, + "loss": 0.6506, + "step": 1567 + }, + { + "epoch": 0.12, + "grad_norm": 1.301153914305594, + "learning_rate": 1.956292919410616e-05, + "loss": 0.6711, + "step": 1568 + }, + { + "epoch": 0.12, + "grad_norm": 1.3308354712898856, + "learning_rate": 1.9562194161035194e-05, + "loss": 0.6517, + "step": 1569 + }, + { + "epoch": 0.12, + "grad_norm": 1.3553894635408312, + "learning_rate": 1.956145852425357e-05, + "loss": 0.616, + "step": 1570 + }, + { + "epoch": 0.12, + "grad_norm": 1.3778268193075993, + "learning_rate": 1.9560722283807744e-05, + "loss": 0.6846, + "step": 1571 + }, + { + "epoch": 0.12, + "grad_norm": 1.38099633040828, + "learning_rate": 1.955998543974419e-05, + "loss": 0.6395, + "step": 1572 + }, + { + "epoch": 0.12, + "grad_norm": 1.275608762631205, + "learning_rate": 1.955924799210944e-05, + "loss": 0.5865, + "step": 1573 + }, + { + "epoch": 0.12, + "grad_norm": 1.2699009283726723, + "learning_rate": 1.9558509940950036e-05, + "loss": 0.6153, + "step": 1574 + }, + { + "epoch": 0.12, + "grad_norm": 1.2834906640569594, + "learning_rate": 1.955777128631259e-05, + "loss": 0.6268, + "step": 1575 + }, + { + "epoch": 0.12, + "grad_norm": 1.1848884777916726, + "learning_rate": 1.9557032028243728e-05, + "loss": 0.6072, + "step": 1576 + }, + { + "epoch": 0.12, + "grad_norm": 1.4470417297399003, + "learning_rate": 1.9556292166790128e-05, + "loss": 0.6804, + "step": 1577 + }, + { + "epoch": 0.12, + "grad_norm": 1.3033755896239438, + "learning_rate": 1.9555551701998496e-05, + "loss": 0.5977, + "step": 1578 + }, + { + "epoch": 0.12, + "grad_norm": 1.2017973473151253, + "learning_rate": 1.9554810633915587e-05, + "loss": 0.5935, + "step": 1579 + }, + { + "epoch": 0.12, + "grad_norm": 1.2440387199024459, + "learning_rate": 1.955406896258819e-05, + "loss": 0.6438, + "step": 1580 + }, + { + "epoch": 0.12, + "grad_norm": 1.308410905048871, + "learning_rate": 1.955332668806312e-05, + "loss": 0.649, + "step": 1581 + }, + { + "epoch": 0.12, + "grad_norm": 1.4653000591365797, + "learning_rate": 1.9552583810387253e-05, + "loss": 0.6331, + "step": 1582 + }, + { + "epoch": 0.12, + "grad_norm": 1.303129076322405, + "learning_rate": 1.9551840329607483e-05, + "loss": 0.6694, + "step": 1583 + }, + { + "epoch": 0.12, + "grad_norm": 1.287249822290956, + "learning_rate": 1.9551096245770752e-05, + "loss": 0.6592, + "step": 1584 + }, + { + "epoch": 0.12, + "grad_norm": 1.329040167159197, + "learning_rate": 1.9550351558924036e-05, + "loss": 0.7047, + "step": 1585 + }, + { + "epoch": 0.12, + "grad_norm": 1.3599395894910644, + "learning_rate": 1.9549606269114358e-05, + "loss": 0.5875, + "step": 1586 + }, + { + "epoch": 0.12, + "grad_norm": 1.360770112527667, + "learning_rate": 1.9548860376388762e-05, + "loss": 0.6684, + "step": 1587 + }, + { + "epoch": 0.12, + "grad_norm": 1.401916863968186, + "learning_rate": 1.9548113880794348e-05, + "loss": 0.7019, + "step": 1588 + }, + { + "epoch": 0.12, + "grad_norm": 1.432876116256592, + "learning_rate": 1.954736678237824e-05, + "loss": 0.6843, + "step": 1589 + }, + { + "epoch": 0.12, + "grad_norm": 1.3406597355252898, + "learning_rate": 1.954661908118761e-05, + "loss": 0.6047, + "step": 1590 + }, + { + "epoch": 0.12, + "grad_norm": 1.3159802027549536, + "learning_rate": 1.9545870777269664e-05, + "loss": 0.6297, + "step": 1591 + }, + { + "epoch": 0.12, + "grad_norm": 1.3958866574174782, + "learning_rate": 1.9545121870671642e-05, + "loss": 0.6356, + "step": 1592 + }, + { + "epoch": 0.12, + "grad_norm": 1.3212703413105757, + "learning_rate": 1.9544372361440836e-05, + "loss": 0.676, + "step": 1593 + }, + { + "epoch": 0.12, + "grad_norm": 1.4324014055231864, + "learning_rate": 1.9543622249624557e-05, + "loss": 0.7143, + "step": 1594 + }, + { + "epoch": 0.12, + "grad_norm": 1.3510361898051917, + "learning_rate": 1.9542871535270168e-05, + "loss": 0.5936, + "step": 1595 + }, + { + "epoch": 0.12, + "grad_norm": 1.433319481331774, + "learning_rate": 1.9542120218425062e-05, + "loss": 0.7274, + "step": 1596 + }, + { + "epoch": 0.12, + "grad_norm": 1.4207225782495838, + "learning_rate": 1.954136829913668e-05, + "loss": 0.7697, + "step": 1597 + }, + { + "epoch": 0.12, + "grad_norm": 1.172925452859515, + "learning_rate": 1.954061577745249e-05, + "loss": 0.6404, + "step": 1598 + }, + { + "epoch": 0.12, + "grad_norm": 1.3739292570695558, + "learning_rate": 1.9539862653419998e-05, + "loss": 0.707, + "step": 1599 + }, + { + "epoch": 0.12, + "grad_norm": 1.2791762433433942, + "learning_rate": 1.9539108927086762e-05, + "loss": 0.6349, + "step": 1600 + }, + { + "epoch": 0.12, + "grad_norm": 1.41234982291409, + "learning_rate": 1.9538354598500358e-05, + "loss": 0.6279, + "step": 1601 + }, + { + "epoch": 0.12, + "grad_norm": 1.3472062396164377, + "learning_rate": 1.9537599667708423e-05, + "loss": 0.6537, + "step": 1602 + }, + { + "epoch": 0.12, + "grad_norm": 1.3598745020295593, + "learning_rate": 1.953684413475861e-05, + "loss": 0.6575, + "step": 1603 + }, + { + "epoch": 0.12, + "grad_norm": 1.3492193030978572, + "learning_rate": 1.9536087999698624e-05, + "loss": 0.6921, + "step": 1604 + }, + { + "epoch": 0.12, + "grad_norm": 1.3350143366126004, + "learning_rate": 1.9535331262576203e-05, + "loss": 0.6351, + "step": 1605 + }, + { + "epoch": 0.12, + "grad_norm": 1.3007999453189176, + "learning_rate": 1.9534573923439124e-05, + "loss": 0.6497, + "step": 1606 + }, + { + "epoch": 0.12, + "grad_norm": 1.4242711261068313, + "learning_rate": 1.95338159823352e-05, + "loss": 0.6587, + "step": 1607 + }, + { + "epoch": 0.12, + "grad_norm": 1.3295646942483512, + "learning_rate": 1.9533057439312286e-05, + "loss": 0.6443, + "step": 1608 + }, + { + "epoch": 0.12, + "grad_norm": 1.285179253925957, + "learning_rate": 1.9532298294418272e-05, + "loss": 0.66, + "step": 1609 + }, + { + "epoch": 0.12, + "grad_norm": 1.4129158133786655, + "learning_rate": 1.9531538547701087e-05, + "loss": 0.6448, + "step": 1610 + }, + { + "epoch": 0.12, + "grad_norm": 1.3419376393850617, + "learning_rate": 1.9530778199208698e-05, + "loss": 0.6247, + "step": 1611 + }, + { + "epoch": 0.13, + "grad_norm": 1.3834339189536926, + "learning_rate": 1.9530017248989107e-05, + "loss": 0.6187, + "step": 1612 + }, + { + "epoch": 0.13, + "grad_norm": 1.4451985082264414, + "learning_rate": 1.9529255697090358e-05, + "loss": 0.6755, + "step": 1613 + }, + { + "epoch": 0.13, + "grad_norm": 1.349543347655055, + "learning_rate": 1.9528493543560533e-05, + "loss": 0.6281, + "step": 1614 + }, + { + "epoch": 0.13, + "grad_norm": 1.3757705696726843, + "learning_rate": 1.9527730788447752e-05, + "loss": 0.6459, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 1.2725048277188138, + "learning_rate": 1.952696743180017e-05, + "loss": 0.6539, + "step": 1616 + }, + { + "epoch": 0.13, + "grad_norm": 1.4875827862438296, + "learning_rate": 1.952620347366598e-05, + "loss": 0.6947, + "step": 1617 + }, + { + "epoch": 0.13, + "grad_norm": 1.494804284799022, + "learning_rate": 1.952543891409341e-05, + "loss": 0.6922, + "step": 1618 + }, + { + "epoch": 0.13, + "grad_norm": 1.3048225835436218, + "learning_rate": 1.9524673753130745e-05, + "loss": 0.6317, + "step": 1619 + }, + { + "epoch": 0.13, + "grad_norm": 1.456086780749599, + "learning_rate": 1.952390799082628e-05, + "loss": 0.717, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 1.3076789505198712, + "learning_rate": 1.952314162722837e-05, + "loss": 0.67, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 1.2647147025049956, + "learning_rate": 1.9522374662385396e-05, + "loss": 0.5876, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 1.495839149584217, + "learning_rate": 1.952160709634578e-05, + "loss": 0.7636, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 1.2971599219911996, + "learning_rate": 1.952083892915798e-05, + "loss": 0.6187, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 1.3695968324982255, + "learning_rate": 1.95200701608705e-05, + "loss": 0.6537, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 1.3637421274351307, + "learning_rate": 1.9519300791531874e-05, + "loss": 0.6864, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 1.2084629493560946, + "learning_rate": 1.9518530821190675e-05, + "loss": 0.6336, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 1.3328254099821017, + "learning_rate": 1.9517760249895518e-05, + "loss": 0.6069, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 1.226836107194319, + "learning_rate": 1.951698907769505e-05, + "loss": 0.6174, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 1.2896108674708435, + "learning_rate": 1.951621730463796e-05, + "loss": 0.6248, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 1.2564568648184522, + "learning_rate": 1.9515444930772972e-05, + "loss": 0.6375, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 1.3202292963480275, + "learning_rate": 1.9514671956148855e-05, + "loss": 0.6724, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 1.2474507085507114, + "learning_rate": 1.951389838081441e-05, + "loss": 0.6047, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 1.5826478109888913, + "learning_rate": 1.951312420481847e-05, + "loss": 0.7121, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 1.3152930649662653, + "learning_rate": 1.951234942820992e-05, + "loss": 0.6369, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 1.3218693780441315, + "learning_rate": 1.9511574051037672e-05, + "loss": 0.6749, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 1.2525439125632163, + "learning_rate": 1.9510798073350686e-05, + "loss": 0.6181, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 1.546422487597453, + "learning_rate": 1.9510021495197943e-05, + "loss": 0.6291, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 1.3831334145794631, + "learning_rate": 1.950924431662848e-05, + "loss": 0.7096, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 1.2982931082047735, + "learning_rate": 1.9508466537691363e-05, + "loss": 0.6685, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 1.2439292836006322, + "learning_rate": 1.9507688158435693e-05, + "loss": 0.6459, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 1.218063356690392, + "learning_rate": 1.950690917891062e-05, + "loss": 0.6122, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 1.2613990307746286, + "learning_rate": 1.950612959916532e-05, + "loss": 0.6282, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 1.2630363646925478, + "learning_rate": 1.9505349419249015e-05, + "loss": 0.6726, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 1.4247964880996937, + "learning_rate": 1.9504568639210956e-05, + "loss": 0.6326, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 1.3253855558164094, + "learning_rate": 1.9503787259100445e-05, + "loss": 0.6852, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 1.4069105504279644, + "learning_rate": 1.9503005278966808e-05, + "loss": 0.6913, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 1.3431000579321597, + "learning_rate": 1.9502222698859422e-05, + "loss": 0.6527, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 1.298649320986588, + "learning_rate": 1.950143951882769e-05, + "loss": 0.6676, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 1.2262062964783422, + "learning_rate": 1.9500655738921058e-05, + "loss": 0.5631, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 1.2051690807670927, + "learning_rate": 1.9499871359189017e-05, + "loss": 0.643, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 1.2912285379581527, + "learning_rate": 1.949908637968108e-05, + "loss": 0.6451, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 1.413028106876958, + "learning_rate": 1.9498300800446815e-05, + "loss": 0.6963, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 1.3135889622126984, + "learning_rate": 1.9497514621535815e-05, + "loss": 0.6241, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 1.2908234561438021, + "learning_rate": 1.9496727842997713e-05, + "loss": 0.6443, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 1.2428083487737023, + "learning_rate": 1.949594046488219e-05, + "loss": 0.6417, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 1.1918627255332315, + "learning_rate": 1.949515248723895e-05, + "loss": 0.5729, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 1.3364460295152696, + "learning_rate": 1.9494363910117745e-05, + "loss": 0.6751, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 1.3817888339476168, + "learning_rate": 1.9493574733568362e-05, + "loss": 0.6299, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 1.336331850262456, + "learning_rate": 1.9492784957640624e-05, + "loss": 0.6401, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 1.5394468650506685, + "learning_rate": 1.94919945823844e-05, + "loss": 0.6854, + "step": 1661 + }, + { + "epoch": 0.13, + "grad_norm": 1.3283612658213446, + "learning_rate": 1.949120360784958e-05, + "loss": 0.6765, + "step": 1662 + }, + { + "epoch": 0.13, + "grad_norm": 1.3156635668395658, + "learning_rate": 1.9490412034086112e-05, + "loss": 0.6036, + "step": 1663 + }, + { + "epoch": 0.13, + "grad_norm": 1.352655310747795, + "learning_rate": 1.9489619861143966e-05, + "loss": 0.6887, + "step": 1664 + }, + { + "epoch": 0.13, + "grad_norm": 1.3882399690809386, + "learning_rate": 1.948882708907316e-05, + "loss": 0.6455, + "step": 1665 + }, + { + "epoch": 0.13, + "grad_norm": 1.4230400594191541, + "learning_rate": 1.9488033717923744e-05, + "loss": 0.6678, + "step": 1666 + }, + { + "epoch": 0.13, + "grad_norm": 1.4792071681678847, + "learning_rate": 1.9487239747745807e-05, + "loss": 0.6669, + "step": 1667 + }, + { + "epoch": 0.13, + "grad_norm": 1.0823281772440976, + "learning_rate": 1.9486445178589477e-05, + "loss": 0.5852, + "step": 1668 + }, + { + "epoch": 0.13, + "grad_norm": 1.30532944470769, + "learning_rate": 1.9485650010504918e-05, + "loss": 0.6312, + "step": 1669 + }, + { + "epoch": 0.13, + "grad_norm": 1.207124132452917, + "learning_rate": 1.9484854243542336e-05, + "loss": 0.5761, + "step": 1670 + }, + { + "epoch": 0.13, + "grad_norm": 1.4307943759977249, + "learning_rate": 1.948405787775197e-05, + "loss": 0.6544, + "step": 1671 + }, + { + "epoch": 0.13, + "grad_norm": 1.2773409035560555, + "learning_rate": 1.94832609131841e-05, + "loss": 0.6917, + "step": 1672 + }, + { + "epoch": 0.13, + "grad_norm": 1.3656208326332877, + "learning_rate": 1.948246334988904e-05, + "loss": 0.6261, + "step": 1673 + }, + { + "epoch": 0.13, + "grad_norm": 1.3661389498937764, + "learning_rate": 1.9481665187917147e-05, + "loss": 0.6416, + "step": 1674 + }, + { + "epoch": 0.13, + "grad_norm": 1.3336420248240874, + "learning_rate": 1.9480866427318807e-05, + "loss": 0.638, + "step": 1675 + }, + { + "epoch": 0.13, + "grad_norm": 1.3478522116782916, + "learning_rate": 1.9480067068144458e-05, + "loss": 0.6766, + "step": 1676 + }, + { + "epoch": 0.13, + "grad_norm": 1.3819711140541093, + "learning_rate": 1.9479267110444566e-05, + "loss": 0.6582, + "step": 1677 + }, + { + "epoch": 0.13, + "grad_norm": 1.3340384834006052, + "learning_rate": 1.9478466554269633e-05, + "loss": 0.5807, + "step": 1678 + }, + { + "epoch": 0.13, + "grad_norm": 1.2321214506212914, + "learning_rate": 1.94776653996702e-05, + "loss": 0.6527, + "step": 1679 + }, + { + "epoch": 0.13, + "grad_norm": 1.42304382910286, + "learning_rate": 1.947686364669686e-05, + "loss": 0.7393, + "step": 1680 + }, + { + "epoch": 0.13, + "grad_norm": 1.351307883562319, + "learning_rate": 1.9476061295400218e-05, + "loss": 0.6712, + "step": 1681 + }, + { + "epoch": 0.13, + "grad_norm": 1.2371071625458785, + "learning_rate": 1.947525834583094e-05, + "loss": 0.6414, + "step": 1682 + }, + { + "epoch": 0.13, + "grad_norm": 1.3435047724319267, + "learning_rate": 1.9474454798039713e-05, + "loss": 0.6386, + "step": 1683 + }, + { + "epoch": 0.13, + "grad_norm": 1.4039604622335615, + "learning_rate": 1.9473650652077275e-05, + "loss": 0.6874, + "step": 1684 + }, + { + "epoch": 0.13, + "grad_norm": 1.3784077205135106, + "learning_rate": 1.9472845907994397e-05, + "loss": 0.6332, + "step": 1685 + }, + { + "epoch": 0.13, + "grad_norm": 1.2000282860442137, + "learning_rate": 1.9472040565841877e-05, + "loss": 0.5581, + "step": 1686 + }, + { + "epoch": 0.13, + "grad_norm": 1.4381181797494187, + "learning_rate": 1.947123462567057e-05, + "loss": 0.6511, + "step": 1687 + }, + { + "epoch": 0.13, + "grad_norm": 1.412294747622676, + "learning_rate": 1.9470428087531354e-05, + "loss": 0.6266, + "step": 1688 + }, + { + "epoch": 0.13, + "grad_norm": 1.3290537559998392, + "learning_rate": 1.9469620951475154e-05, + "loss": 0.6643, + "step": 1689 + }, + { + "epoch": 0.13, + "grad_norm": 1.2301033567407134, + "learning_rate": 1.9468813217552926e-05, + "loss": 0.6454, + "step": 1690 + }, + { + "epoch": 0.13, + "grad_norm": 1.2913551518923005, + "learning_rate": 1.9468004885815667e-05, + "loss": 0.6124, + "step": 1691 + }, + { + "epoch": 0.13, + "grad_norm": 1.2024786563018028, + "learning_rate": 1.946719595631441e-05, + "loss": 0.5772, + "step": 1692 + }, + { + "epoch": 0.13, + "grad_norm": 1.4841037703217972, + "learning_rate": 1.946638642910023e-05, + "loss": 0.6568, + "step": 1693 + }, + { + "epoch": 0.13, + "grad_norm": 1.3363512971057134, + "learning_rate": 1.9465576304224233e-05, + "loss": 0.6735, + "step": 1694 + }, + { + "epoch": 0.13, + "grad_norm": 1.3300614545004312, + "learning_rate": 1.9464765581737567e-05, + "loss": 0.6477, + "step": 1695 + }, + { + "epoch": 0.13, + "grad_norm": 1.3342649910313202, + "learning_rate": 1.946395426169142e-05, + "loss": 0.6188, + "step": 1696 + }, + { + "epoch": 0.13, + "grad_norm": 1.356269709711965, + "learning_rate": 1.9463142344137013e-05, + "loss": 0.6254, + "step": 1697 + }, + { + "epoch": 0.13, + "grad_norm": 1.2737688908846687, + "learning_rate": 1.9462329829125604e-05, + "loss": 0.6352, + "step": 1698 + }, + { + "epoch": 0.13, + "grad_norm": 1.2928672819861524, + "learning_rate": 1.9461516716708496e-05, + "loss": 0.6746, + "step": 1699 + }, + { + "epoch": 0.13, + "grad_norm": 1.2462527852232945, + "learning_rate": 1.9460703006937023e-05, + "loss": 0.6072, + "step": 1700 + }, + { + "epoch": 0.13, + "grad_norm": 1.3600213717436305, + "learning_rate": 1.9459888699862555e-05, + "loss": 0.6699, + "step": 1701 + }, + { + "epoch": 0.13, + "grad_norm": 1.304672241121708, + "learning_rate": 1.9459073795536513e-05, + "loss": 0.6614, + "step": 1702 + }, + { + "epoch": 0.13, + "grad_norm": 1.260479344892915, + "learning_rate": 1.9458258294010335e-05, + "loss": 0.5953, + "step": 1703 + }, + { + "epoch": 0.13, + "grad_norm": 1.4027928135425936, + "learning_rate": 1.9457442195335514e-05, + "loss": 0.6853, + "step": 1704 + }, + { + "epoch": 0.13, + "grad_norm": 1.243047116900357, + "learning_rate": 1.945662549956357e-05, + "loss": 0.6072, + "step": 1705 + }, + { + "epoch": 0.13, + "grad_norm": 1.2468337011548778, + "learning_rate": 1.945580820674607e-05, + "loss": 0.6176, + "step": 1706 + }, + { + "epoch": 0.13, + "grad_norm": 1.4600280542160484, + "learning_rate": 1.9454990316934618e-05, + "loss": 0.6828, + "step": 1707 + }, + { + "epoch": 0.13, + "grad_norm": 1.4361574288889705, + "learning_rate": 1.9454171830180842e-05, + "loss": 0.6607, + "step": 1708 + }, + { + "epoch": 0.13, + "grad_norm": 1.2911049123754939, + "learning_rate": 1.945335274653642e-05, + "loss": 0.656, + "step": 1709 + }, + { + "epoch": 0.13, + "grad_norm": 1.220834953819128, + "learning_rate": 1.9452533066053067e-05, + "loss": 0.5929, + "step": 1710 + }, + { + "epoch": 0.13, + "grad_norm": 1.2237992474131756, + "learning_rate": 1.9451712788782534e-05, + "loss": 0.5865, + "step": 1711 + }, + { + "epoch": 0.13, + "grad_norm": 1.386744969079819, + "learning_rate": 1.9450891914776605e-05, + "loss": 0.6868, + "step": 1712 + }, + { + "epoch": 0.13, + "grad_norm": 1.2609752436664927, + "learning_rate": 1.9450070444087113e-05, + "loss": 0.6056, + "step": 1713 + }, + { + "epoch": 0.13, + "grad_norm": 1.2892210429403175, + "learning_rate": 1.9449248376765918e-05, + "loss": 0.5985, + "step": 1714 + }, + { + "epoch": 0.13, + "grad_norm": 1.4446119930628774, + "learning_rate": 1.9448425712864917e-05, + "loss": 0.6983, + "step": 1715 + }, + { + "epoch": 0.13, + "grad_norm": 1.4259015594306956, + "learning_rate": 1.9447602452436058e-05, + "loss": 0.6322, + "step": 1716 + }, + { + "epoch": 0.13, + "grad_norm": 1.226230503514461, + "learning_rate": 1.944677859553131e-05, + "loss": 0.6451, + "step": 1717 + }, + { + "epoch": 0.13, + "grad_norm": 1.283456066174709, + "learning_rate": 1.944595414220269e-05, + "loss": 0.6641, + "step": 1718 + }, + { + "epoch": 0.13, + "grad_norm": 1.3642209811962993, + "learning_rate": 1.9445129092502253e-05, + "loss": 0.6395, + "step": 1719 + }, + { + "epoch": 0.13, + "grad_norm": 1.245708441879969, + "learning_rate": 1.9444303446482086e-05, + "loss": 0.6451, + "step": 1720 + }, + { + "epoch": 0.13, + "grad_norm": 1.3245842353327342, + "learning_rate": 1.9443477204194315e-05, + "loss": 0.6655, + "step": 1721 + }, + { + "epoch": 0.13, + "grad_norm": 1.2715552056854047, + "learning_rate": 1.9442650365691102e-05, + "loss": 0.6386, + "step": 1722 + }, + { + "epoch": 0.13, + "grad_norm": 1.3462189459517546, + "learning_rate": 1.944182293102466e-05, + "loss": 0.6391, + "step": 1723 + }, + { + "epoch": 0.13, + "grad_norm": 1.2734904366004214, + "learning_rate": 1.944099490024722e-05, + "loss": 0.6149, + "step": 1724 + }, + { + "epoch": 0.13, + "grad_norm": 1.3659556043232803, + "learning_rate": 1.9440166273411062e-05, + "loss": 0.6347, + "step": 1725 + }, + { + "epoch": 0.13, + "grad_norm": 1.2292438525949423, + "learning_rate": 1.9439337050568504e-05, + "loss": 0.5649, + "step": 1726 + }, + { + "epoch": 0.13, + "grad_norm": 1.2587119257149153, + "learning_rate": 1.9438507231771897e-05, + "loss": 0.6533, + "step": 1727 + }, + { + "epoch": 0.13, + "grad_norm": 1.3552559457439861, + "learning_rate": 1.9437676817073635e-05, + "loss": 0.6966, + "step": 1728 + }, + { + "epoch": 0.13, + "grad_norm": 1.317781448848758, + "learning_rate": 1.9436845806526137e-05, + "loss": 0.6053, + "step": 1729 + }, + { + "epoch": 0.13, + "grad_norm": 1.218933091715898, + "learning_rate": 1.943601420018188e-05, + "loss": 0.6241, + "step": 1730 + }, + { + "epoch": 0.13, + "grad_norm": 1.2819330674164047, + "learning_rate": 1.9435181998093363e-05, + "loss": 0.5859, + "step": 1731 + }, + { + "epoch": 0.13, + "grad_norm": 1.2979205868796004, + "learning_rate": 1.9434349200313126e-05, + "loss": 0.6189, + "step": 1732 + }, + { + "epoch": 0.13, + "grad_norm": 1.2631881236027205, + "learning_rate": 1.9433515806893754e-05, + "loss": 0.6205, + "step": 1733 + }, + { + "epoch": 0.13, + "grad_norm": 1.211010789191571, + "learning_rate": 1.9432681817887854e-05, + "loss": 0.6586, + "step": 1734 + }, + { + "epoch": 0.13, + "grad_norm": 1.3550807166603973, + "learning_rate": 1.943184723334808e-05, + "loss": 0.6747, + "step": 1735 + }, + { + "epoch": 0.13, + "grad_norm": 1.325161803460716, + "learning_rate": 1.9431012053327135e-05, + "loss": 0.6424, + "step": 1736 + }, + { + "epoch": 0.13, + "grad_norm": 1.2578420694796821, + "learning_rate": 1.943017627787774e-05, + "loss": 0.6117, + "step": 1737 + }, + { + "epoch": 0.13, + "grad_norm": 1.2846426821764276, + "learning_rate": 1.9429339907052666e-05, + "loss": 0.6048, + "step": 1738 + }, + { + "epoch": 0.13, + "grad_norm": 1.3351920652636022, + "learning_rate": 1.942850294090471e-05, + "loss": 0.6545, + "step": 1739 + }, + { + "epoch": 0.13, + "grad_norm": 1.2275681473253612, + "learning_rate": 1.942766537948672e-05, + "loss": 0.5942, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 1.448747913138571, + "learning_rate": 1.9426827222851576e-05, + "loss": 0.6672, + "step": 1741 + }, + { + "epoch": 0.14, + "grad_norm": 1.3551998256423143, + "learning_rate": 1.942598847105219e-05, + "loss": 0.7107, + "step": 1742 + }, + { + "epoch": 0.14, + "grad_norm": 1.2442038144244327, + "learning_rate": 1.9425149124141524e-05, + "loss": 0.584, + "step": 1743 + }, + { + "epoch": 0.14, + "grad_norm": 1.4410549655837623, + "learning_rate": 1.9424309182172563e-05, + "loss": 0.6376, + "step": 1744 + }, + { + "epoch": 0.14, + "grad_norm": 1.3702917116995883, + "learning_rate": 1.9423468645198342e-05, + "loss": 0.6489, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 1.3973419757274244, + "learning_rate": 1.9422627513271927e-05, + "loss": 0.6646, + "step": 1746 + }, + { + "epoch": 0.14, + "grad_norm": 1.2005454313967445, + "learning_rate": 1.9421785786446422e-05, + "loss": 0.6064, + "step": 1747 + }, + { + "epoch": 0.14, + "grad_norm": 1.4382653065076993, + "learning_rate": 1.942094346477497e-05, + "loss": 0.633, + "step": 1748 + }, + { + "epoch": 0.14, + "grad_norm": 1.2985161657638071, + "learning_rate": 1.9420100548310753e-05, + "loss": 0.5816, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 1.2276991418873875, + "learning_rate": 1.9419257037106983e-05, + "loss": 0.6184, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 1.321233168813278, + "learning_rate": 1.9418412931216926e-05, + "loss": 0.6215, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 1.2310511104066357, + "learning_rate": 1.9417568230693863e-05, + "loss": 0.6148, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 1.201918951174184, + "learning_rate": 1.941672293559113e-05, + "loss": 0.6409, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 1.2820192679357278, + "learning_rate": 1.9415877045962093e-05, + "loss": 0.6141, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 1.4418860983287094, + "learning_rate": 1.941503056186016e-05, + "loss": 0.6538, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 1.5504665103409105, + "learning_rate": 1.9414183483338777e-05, + "loss": 0.7011, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 1.3230350121055532, + "learning_rate": 1.9413335810451412e-05, + "loss": 0.6637, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 1.2218971211413936, + "learning_rate": 1.9412487543251596e-05, + "loss": 0.701, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 1.2913811378322062, + "learning_rate": 1.941163868179288e-05, + "loss": 0.6714, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 1.2373744399981528, + "learning_rate": 1.9410789226128853e-05, + "loss": 0.6462, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 1.385578385095075, + "learning_rate": 1.940993917631315e-05, + "loss": 0.6733, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 1.385045420111922, + "learning_rate": 1.9409088532399436e-05, + "loss": 0.6381, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 1.2244025486197705, + "learning_rate": 1.9408237294441422e-05, + "loss": 0.6123, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 1.3612472969918086, + "learning_rate": 1.9407385462492846e-05, + "loss": 0.6183, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 1.3186129403867841, + "learning_rate": 1.9406533036607488e-05, + "loss": 0.6606, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 1.2598968674065567, + "learning_rate": 1.940568001683917e-05, + "loss": 0.6348, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 1.3952888119385922, + "learning_rate": 1.9404826403241746e-05, + "loss": 0.6662, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 1.206270589306881, + "learning_rate": 1.940397219586911e-05, + "loss": 0.5745, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 1.2370139294474551, + "learning_rate": 1.940311739477519e-05, + "loss": 0.591, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 1.3182917824831164, + "learning_rate": 1.940226200001395e-05, + "loss": 0.6754, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 1.412846585178781, + "learning_rate": 1.940140601163941e-05, + "loss": 0.6588, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 1.3167438470375483, + "learning_rate": 1.9400549429705597e-05, + "loss": 0.6503, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 1.3200466633264807, + "learning_rate": 1.93996922542666e-05, + "loss": 0.6787, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 1.230184564621934, + "learning_rate": 1.9398834485376534e-05, + "loss": 0.6585, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 1.303266882547969, + "learning_rate": 1.9397976123089558e-05, + "loss": 0.6916, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 1.202594541057109, + "learning_rate": 1.9397117167459858e-05, + "loss": 0.6339, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 1.2912817145530682, + "learning_rate": 1.939625761854167e-05, + "loss": 0.6583, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 1.2899321425872985, + "learning_rate": 1.9395397476389265e-05, + "loss": 0.5833, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 1.1868743754260211, + "learning_rate": 1.939453674105694e-05, + "loss": 0.6209, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 1.3231568256167092, + "learning_rate": 1.9393675412599037e-05, + "loss": 0.6849, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 1.2318035333767627, + "learning_rate": 1.9392813491069944e-05, + "loss": 0.5975, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 1.35401319465519, + "learning_rate": 1.9391950976524075e-05, + "loss": 0.6275, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 1.3256753657807507, + "learning_rate": 1.9391087869015884e-05, + "loss": 0.6601, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 1.431278365074837, + "learning_rate": 1.9390224168599864e-05, + "loss": 0.7256, + "step": 1784 + }, + { + "epoch": 0.14, + "grad_norm": 1.3296045028657328, + "learning_rate": 1.9389359875330548e-05, + "loss": 0.6998, + "step": 1785 + }, + { + "epoch": 0.14, + "grad_norm": 1.3195173589179643, + "learning_rate": 1.9388494989262498e-05, + "loss": 0.6338, + "step": 1786 + }, + { + "epoch": 0.14, + "grad_norm": 1.2677220055692577, + "learning_rate": 1.9387629510450318e-05, + "loss": 0.6223, + "step": 1787 + }, + { + "epoch": 0.14, + "grad_norm": 1.2939126525211937, + "learning_rate": 1.9386763438948656e-05, + "loss": 0.6535, + "step": 1788 + }, + { + "epoch": 0.14, + "grad_norm": 1.379490153281343, + "learning_rate": 1.938589677481219e-05, + "loss": 0.6291, + "step": 1789 + }, + { + "epoch": 0.14, + "grad_norm": 1.3741811568261744, + "learning_rate": 1.9385029518095635e-05, + "loss": 0.6691, + "step": 1790 + }, + { + "epoch": 0.14, + "grad_norm": 1.2312748282006032, + "learning_rate": 1.9384161668853746e-05, + "loss": 0.5783, + "step": 1791 + }, + { + "epoch": 0.14, + "grad_norm": 1.3642264426030555, + "learning_rate": 1.9383293227141316e-05, + "loss": 0.6339, + "step": 1792 + }, + { + "epoch": 0.14, + "grad_norm": 1.1622154892630587, + "learning_rate": 1.9382424193013172e-05, + "loss": 0.5877, + "step": 1793 + }, + { + "epoch": 0.14, + "grad_norm": 1.3390573134260653, + "learning_rate": 1.938155456652418e-05, + "loss": 0.627, + "step": 1794 + }, + { + "epoch": 0.14, + "grad_norm": 1.3374954419637717, + "learning_rate": 1.938068434772925e-05, + "loss": 0.6592, + "step": 1795 + }, + { + "epoch": 0.14, + "grad_norm": 1.3315944458845368, + "learning_rate": 1.9379813536683314e-05, + "loss": 0.709, + "step": 1796 + }, + { + "epoch": 0.14, + "grad_norm": 1.3022277395916675, + "learning_rate": 1.9378942133441357e-05, + "loss": 0.683, + "step": 1797 + }, + { + "epoch": 0.14, + "grad_norm": 1.357890008573226, + "learning_rate": 1.9378070138058396e-05, + "loss": 0.6939, + "step": 1798 + }, + { + "epoch": 0.14, + "grad_norm": 1.3082880830330057, + "learning_rate": 1.9377197550589484e-05, + "loss": 0.6154, + "step": 1799 + }, + { + "epoch": 0.14, + "grad_norm": 1.3562562617144207, + "learning_rate": 1.9376324371089707e-05, + "loss": 0.6646, + "step": 1800 + }, + { + "epoch": 0.14, + "grad_norm": 1.3542960863135862, + "learning_rate": 1.93754505996142e-05, + "loss": 0.672, + "step": 1801 + }, + { + "epoch": 0.14, + "grad_norm": 1.2775218501154013, + "learning_rate": 1.9374576236218123e-05, + "loss": 0.631, + "step": 1802 + }, + { + "epoch": 0.14, + "grad_norm": 1.1758775639928287, + "learning_rate": 1.9373701280956685e-05, + "loss": 0.5294, + "step": 1803 + }, + { + "epoch": 0.14, + "grad_norm": 1.2517750058397301, + "learning_rate": 1.937282573388512e-05, + "loss": 0.6509, + "step": 1804 + }, + { + "epoch": 0.14, + "grad_norm": 1.1802088109840623, + "learning_rate": 1.937194959505871e-05, + "loss": 0.6129, + "step": 1805 + }, + { + "epoch": 0.14, + "grad_norm": 1.3102995273813116, + "learning_rate": 1.937107286453277e-05, + "loss": 0.6104, + "step": 1806 + }, + { + "epoch": 0.14, + "grad_norm": 1.3008657891922533, + "learning_rate": 1.9370195542362653e-05, + "loss": 0.6734, + "step": 1807 + }, + { + "epoch": 0.14, + "grad_norm": 1.2025573183860743, + "learning_rate": 1.9369317628603744e-05, + "loss": 0.599, + "step": 1808 + }, + { + "epoch": 0.14, + "grad_norm": 1.3371318363907538, + "learning_rate": 1.936843912331148e-05, + "loss": 0.6318, + "step": 1809 + }, + { + "epoch": 0.14, + "grad_norm": 1.284488678516641, + "learning_rate": 1.936756002654131e-05, + "loss": 0.6081, + "step": 1810 + }, + { + "epoch": 0.14, + "grad_norm": 1.217664577632365, + "learning_rate": 1.9366680338348755e-05, + "loss": 0.5891, + "step": 1811 + }, + { + "epoch": 0.14, + "grad_norm": 1.186817876757914, + "learning_rate": 1.9365800058789338e-05, + "loss": 0.5698, + "step": 1812 + }, + { + "epoch": 0.14, + "grad_norm": 1.2439156752613698, + "learning_rate": 1.9364919187918644e-05, + "loss": 0.6373, + "step": 1813 + }, + { + "epoch": 0.14, + "grad_norm": 1.3006422638744117, + "learning_rate": 1.9364037725792287e-05, + "loss": 0.6787, + "step": 1814 + }, + { + "epoch": 0.14, + "grad_norm": 1.2967798588795363, + "learning_rate": 1.9363155672465913e-05, + "loss": 0.6798, + "step": 1815 + }, + { + "epoch": 0.14, + "grad_norm": 1.2913093636706705, + "learning_rate": 1.9362273027995217e-05, + "loss": 0.585, + "step": 1816 + }, + { + "epoch": 0.14, + "grad_norm": 1.2658277749464204, + "learning_rate": 1.9361389792435918e-05, + "loss": 0.6483, + "step": 1817 + }, + { + "epoch": 0.14, + "grad_norm": 1.2531786081924103, + "learning_rate": 1.9360505965843784e-05, + "loss": 0.6083, + "step": 1818 + }, + { + "epoch": 0.14, + "grad_norm": 1.1847368517965333, + "learning_rate": 1.9359621548274616e-05, + "loss": 0.6145, + "step": 1819 + }, + { + "epoch": 0.14, + "grad_norm": 1.2516421022472517, + "learning_rate": 1.935873653978425e-05, + "loss": 0.6707, + "step": 1820 + }, + { + "epoch": 0.14, + "grad_norm": 1.3681852786960234, + "learning_rate": 1.935785094042856e-05, + "loss": 0.697, + "step": 1821 + }, + { + "epoch": 0.14, + "grad_norm": 1.351966136130191, + "learning_rate": 1.9356964750263464e-05, + "loss": 0.633, + "step": 1822 + }, + { + "epoch": 0.14, + "grad_norm": 1.2821431187296615, + "learning_rate": 1.9356077969344904e-05, + "loss": 0.6545, + "step": 1823 + }, + { + "epoch": 0.14, + "grad_norm": 1.31030953499522, + "learning_rate": 1.9355190597728874e-05, + "loss": 0.6607, + "step": 1824 + }, + { + "epoch": 0.14, + "grad_norm": 1.2535364669346227, + "learning_rate": 1.9354302635471392e-05, + "loss": 0.5522, + "step": 1825 + }, + { + "epoch": 0.14, + "grad_norm": 1.3087551643685043, + "learning_rate": 1.9353414082628523e-05, + "loss": 0.6121, + "step": 1826 + }, + { + "epoch": 0.14, + "grad_norm": 1.4113829288775175, + "learning_rate": 1.9352524939256367e-05, + "loss": 0.637, + "step": 1827 + }, + { + "epoch": 0.14, + "grad_norm": 1.4263114883849193, + "learning_rate": 1.935163520541106e-05, + "loss": 0.63, + "step": 1828 + }, + { + "epoch": 0.14, + "grad_norm": 1.2431996858082133, + "learning_rate": 1.935074488114877e-05, + "loss": 0.6011, + "step": 1829 + }, + { + "epoch": 0.14, + "grad_norm": 1.3271925401692968, + "learning_rate": 1.9349853966525715e-05, + "loss": 0.7236, + "step": 1830 + }, + { + "epoch": 0.14, + "grad_norm": 1.2365950406265822, + "learning_rate": 1.934896246159814e-05, + "loss": 0.64, + "step": 1831 + }, + { + "epoch": 0.14, + "grad_norm": 1.2583187814001624, + "learning_rate": 1.9348070366422333e-05, + "loss": 0.6623, + "step": 1832 + }, + { + "epoch": 0.14, + "grad_norm": 1.2761617902410398, + "learning_rate": 1.934717768105461e-05, + "loss": 0.5737, + "step": 1833 + }, + { + "epoch": 0.14, + "grad_norm": 1.3179462152803372, + "learning_rate": 1.9346284405551333e-05, + "loss": 0.6379, + "step": 1834 + }, + { + "epoch": 0.14, + "grad_norm": 1.2759810712271338, + "learning_rate": 1.9345390539968907e-05, + "loss": 0.6245, + "step": 1835 + }, + { + "epoch": 0.14, + "grad_norm": 1.455047975778258, + "learning_rate": 1.934449608436376e-05, + "loss": 0.6904, + "step": 1836 + }, + { + "epoch": 0.14, + "grad_norm": 1.3158179533536536, + "learning_rate": 1.934360103879236e-05, + "loss": 0.6602, + "step": 1837 + }, + { + "epoch": 0.14, + "grad_norm": 1.1974933152619458, + "learning_rate": 1.934270540331122e-05, + "loss": 0.6181, + "step": 1838 + }, + { + "epoch": 0.14, + "grad_norm": 1.404012086134293, + "learning_rate": 1.934180917797689e-05, + "loss": 0.5962, + "step": 1839 + }, + { + "epoch": 0.14, + "grad_norm": 1.2609085930556203, + "learning_rate": 1.9340912362845946e-05, + "loss": 0.613, + "step": 1840 + }, + { + "epoch": 0.14, + "grad_norm": 1.2277558953841405, + "learning_rate": 1.934001495797501e-05, + "loss": 0.6423, + "step": 1841 + }, + { + "epoch": 0.14, + "grad_norm": 1.369381914630601, + "learning_rate": 1.9339116963420746e-05, + "loss": 0.6398, + "step": 1842 + }, + { + "epoch": 0.14, + "grad_norm": 1.249633306123298, + "learning_rate": 1.9338218379239842e-05, + "loss": 0.6147, + "step": 1843 + }, + { + "epoch": 0.14, + "grad_norm": 1.3382573202992287, + "learning_rate": 1.9337319205489035e-05, + "loss": 0.6244, + "step": 1844 + }, + { + "epoch": 0.14, + "grad_norm": 1.1796217135850182, + "learning_rate": 1.933641944222509e-05, + "loss": 0.6093, + "step": 1845 + }, + { + "epoch": 0.14, + "grad_norm": 1.260914927378649, + "learning_rate": 1.9335519089504816e-05, + "loss": 0.5725, + "step": 1846 + }, + { + "epoch": 0.14, + "grad_norm": 1.4294441724818887, + "learning_rate": 1.933461814738506e-05, + "loss": 0.7315, + "step": 1847 + }, + { + "epoch": 0.14, + "grad_norm": 1.607083587461845, + "learning_rate": 1.9333716615922696e-05, + "loss": 0.6928, + "step": 1848 + }, + { + "epoch": 0.14, + "grad_norm": 1.2153931874773367, + "learning_rate": 1.933281449517465e-05, + "loss": 0.5779, + "step": 1849 + }, + { + "epoch": 0.14, + "grad_norm": 1.3373316578675485, + "learning_rate": 1.933191178519787e-05, + "loss": 0.7226, + "step": 1850 + }, + { + "epoch": 0.14, + "grad_norm": 1.370734925878231, + "learning_rate": 1.9331008486049355e-05, + "loss": 0.684, + "step": 1851 + }, + { + "epoch": 0.14, + "grad_norm": 1.4186321629729854, + "learning_rate": 1.933010459778613e-05, + "loss": 0.6748, + "step": 1852 + }, + { + "epoch": 0.14, + "grad_norm": 1.187836047357897, + "learning_rate": 1.9329200120465268e-05, + "loss": 0.6663, + "step": 1853 + }, + { + "epoch": 0.14, + "grad_norm": 1.2605960445076303, + "learning_rate": 1.932829505414387e-05, + "loss": 0.552, + "step": 1854 + }, + { + "epoch": 0.14, + "grad_norm": 1.337893476274334, + "learning_rate": 1.9327389398879078e-05, + "loss": 0.6848, + "step": 1855 + }, + { + "epoch": 0.14, + "grad_norm": 1.4702630062063125, + "learning_rate": 1.9326483154728066e-05, + "loss": 0.676, + "step": 1856 + }, + { + "epoch": 0.14, + "grad_norm": 1.341186162348423, + "learning_rate": 1.9325576321748058e-05, + "loss": 0.7112, + "step": 1857 + }, + { + "epoch": 0.14, + "grad_norm": 1.2551070313578359, + "learning_rate": 1.9324668899996305e-05, + "loss": 0.6449, + "step": 1858 + }, + { + "epoch": 0.14, + "grad_norm": 1.140458708882373, + "learning_rate": 1.932376088953009e-05, + "loss": 0.6111, + "step": 1859 + }, + { + "epoch": 0.14, + "grad_norm": 1.2644659315703413, + "learning_rate": 1.932285229040675e-05, + "loss": 0.631, + "step": 1860 + }, + { + "epoch": 0.14, + "grad_norm": 1.348570450011457, + "learning_rate": 1.9321943102683642e-05, + "loss": 0.6777, + "step": 1861 + }, + { + "epoch": 0.14, + "grad_norm": 1.4518527131950292, + "learning_rate": 1.9321033326418174e-05, + "loss": 0.6513, + "step": 1862 + }, + { + "epoch": 0.14, + "grad_norm": 1.328931451431817, + "learning_rate": 1.932012296166778e-05, + "loss": 0.6438, + "step": 1863 + }, + { + "epoch": 0.14, + "grad_norm": 1.4358748495985703, + "learning_rate": 1.931921200848994e-05, + "loss": 0.6652, + "step": 1864 + }, + { + "epoch": 0.14, + "grad_norm": 1.3893122531108628, + "learning_rate": 1.931830046694216e-05, + "loss": 0.6307, + "step": 1865 + }, + { + "epoch": 0.14, + "grad_norm": 1.244040492652964, + "learning_rate": 1.9317388337082002e-05, + "loss": 0.5596, + "step": 1866 + }, + { + "epoch": 0.14, + "grad_norm": 1.2629449513505753, + "learning_rate": 1.9316475618967045e-05, + "loss": 0.677, + "step": 1867 + }, + { + "epoch": 0.14, + "grad_norm": 1.2777468077504603, + "learning_rate": 1.9315562312654912e-05, + "loss": 0.643, + "step": 1868 + }, + { + "epoch": 0.14, + "grad_norm": 1.381660714514366, + "learning_rate": 1.931464841820327e-05, + "loss": 0.6704, + "step": 1869 + }, + { + "epoch": 0.15, + "grad_norm": 1.1949993546137005, + "learning_rate": 1.9313733935669817e-05, + "loss": 0.5983, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 1.3454682653193897, + "learning_rate": 1.9312818865112288e-05, + "loss": 0.6332, + "step": 1871 + }, + { + "epoch": 0.15, + "grad_norm": 1.2397029188585333, + "learning_rate": 1.9311903206588455e-05, + "loss": 0.5938, + "step": 1872 + }, + { + "epoch": 0.15, + "grad_norm": 1.2834543014255149, + "learning_rate": 1.9310986960156132e-05, + "loss": 0.5874, + "step": 1873 + }, + { + "epoch": 0.15, + "grad_norm": 1.257630945217229, + "learning_rate": 1.931007012587316e-05, + "loss": 0.5996, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 1.409777772787156, + "learning_rate": 1.930915270379743e-05, + "loss": 0.6249, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 1.3882650001961634, + "learning_rate": 1.930823469398686e-05, + "loss": 0.7336, + "step": 1876 + }, + { + "epoch": 0.15, + "grad_norm": 1.3206648948843052, + "learning_rate": 1.9307316096499412e-05, + "loss": 0.6249, + "step": 1877 + }, + { + "epoch": 0.15, + "grad_norm": 1.2694585221806376, + "learning_rate": 1.9306396911393076e-05, + "loss": 0.5818, + "step": 1878 + }, + { + "epoch": 0.15, + "grad_norm": 1.3495558467247493, + "learning_rate": 1.930547713872589e-05, + "loss": 0.6692, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 1.1109811143857988, + "learning_rate": 1.930455677855592e-05, + "loss": 0.6271, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 1.3275951001573685, + "learning_rate": 1.930363583094128e-05, + "loss": 0.6553, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 1.4158037119179134, + "learning_rate": 1.9302714295940107e-05, + "loss": 0.638, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 1.2703026408696196, + "learning_rate": 1.9301792173610584e-05, + "loss": 0.6525, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 1.331144468396001, + "learning_rate": 1.930086946401093e-05, + "loss": 0.628, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 1.1939521977809402, + "learning_rate": 1.9299946167199405e-05, + "loss": 0.6167, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 1.2909777198570933, + "learning_rate": 1.9299022283234288e-05, + "loss": 0.6617, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 1.3197922895029108, + "learning_rate": 1.9298097812173926e-05, + "loss": 0.6428, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 1.254458439536501, + "learning_rate": 1.9297172754076677e-05, + "loss": 0.5862, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 1.264577879833715, + "learning_rate": 1.929624710900094e-05, + "loss": 0.6829, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 1.2316664422327095, + "learning_rate": 1.9295320877005163e-05, + "loss": 0.62, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 1.3698721984088331, + "learning_rate": 1.9294394058147822e-05, + "loss": 0.6855, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 1.2304175956628696, + "learning_rate": 1.929346665248743e-05, + "loss": 0.5959, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 1.3053276638678097, + "learning_rate": 1.9292538660082543e-05, + "loss": 0.6279, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 1.1651293071318936, + "learning_rate": 1.9291610080991743e-05, + "loss": 0.6432, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 1.2475735956473255, + "learning_rate": 1.9290680915273662e-05, + "loss": 0.6422, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 1.279957421682411, + "learning_rate": 1.928975116298696e-05, + "loss": 0.637, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 1.3006048226063103, + "learning_rate": 1.928882082419034e-05, + "loss": 0.6616, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 1.244368791243786, + "learning_rate": 1.9287889898942537e-05, + "loss": 0.7127, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 1.269686363721072, + "learning_rate": 1.9286958387302327e-05, + "loss": 0.61, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 1.3600335115623554, + "learning_rate": 1.928602628932852e-05, + "loss": 0.6265, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 1.2977633363747276, + "learning_rate": 1.928509360507996e-05, + "loss": 0.6553, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 1.243239383258666, + "learning_rate": 1.928416033461554e-05, + "loss": 0.6289, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 1.2940684364328232, + "learning_rate": 1.9283226477994173e-05, + "loss": 0.5865, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 1.2638571366081577, + "learning_rate": 1.928229203527483e-05, + "loss": 0.6266, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 1.2164493756498795, + "learning_rate": 1.9281357006516496e-05, + "loss": 0.6113, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 1.3877431175378758, + "learning_rate": 1.928042139177821e-05, + "loss": 0.7048, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 1.2450651988227195, + "learning_rate": 1.9279485191119042e-05, + "loss": 0.58, + "step": 1907 + }, + { + "epoch": 0.15, + "grad_norm": 1.356594574572605, + "learning_rate": 1.9278548404598094e-05, + "loss": 0.646, + "step": 1908 + }, + { + "epoch": 0.15, + "grad_norm": 1.3247773559216494, + "learning_rate": 1.9277611032274517e-05, + "loss": 0.6741, + "step": 1909 + }, + { + "epoch": 0.15, + "grad_norm": 1.3930549821676028, + "learning_rate": 1.927667307420749e-05, + "loss": 0.6613, + "step": 1910 + }, + { + "epoch": 0.15, + "grad_norm": 1.2741597941387712, + "learning_rate": 1.927573453045623e-05, + "loss": 0.6702, + "step": 1911 + }, + { + "epoch": 0.15, + "grad_norm": 1.3242502124390838, + "learning_rate": 1.9274795401079987e-05, + "loss": 0.6474, + "step": 1912 + }, + { + "epoch": 0.15, + "grad_norm": 1.3227753551232575, + "learning_rate": 1.9273855686138065e-05, + "loss": 0.6436, + "step": 1913 + }, + { + "epoch": 0.15, + "grad_norm": 1.2362758149324338, + "learning_rate": 1.927291538568979e-05, + "loss": 0.6379, + "step": 1914 + }, + { + "epoch": 0.15, + "grad_norm": 1.2282747114332857, + "learning_rate": 1.9271974499794518e-05, + "loss": 0.6129, + "step": 1915 + }, + { + "epoch": 0.15, + "grad_norm": 1.2991242264835519, + "learning_rate": 1.927103302851166e-05, + "loss": 0.6573, + "step": 1916 + }, + { + "epoch": 0.15, + "grad_norm": 1.257815959285931, + "learning_rate": 1.9270090971900653e-05, + "loss": 0.6114, + "step": 1917 + }, + { + "epoch": 0.15, + "grad_norm": 1.321936021122329, + "learning_rate": 1.926914833002098e-05, + "loss": 0.6245, + "step": 1918 + }, + { + "epoch": 0.15, + "grad_norm": 1.3837403024335373, + "learning_rate": 1.9268205102932152e-05, + "loss": 0.6319, + "step": 1919 + }, + { + "epoch": 0.15, + "grad_norm": 1.297540562828054, + "learning_rate": 1.9267261290693712e-05, + "loss": 0.6771, + "step": 1920 + }, + { + "epoch": 0.15, + "grad_norm": 1.425844708428341, + "learning_rate": 1.926631689336526e-05, + "loss": 0.6402, + "step": 1921 + }, + { + "epoch": 0.15, + "grad_norm": 1.2936495460911377, + "learning_rate": 1.926537191100641e-05, + "loss": 0.6137, + "step": 1922 + }, + { + "epoch": 0.15, + "grad_norm": 1.2040072587481967, + "learning_rate": 1.9264426343676828e-05, + "loss": 0.6284, + "step": 1923 + }, + { + "epoch": 0.15, + "grad_norm": 1.1990212144223669, + "learning_rate": 1.9263480191436214e-05, + "loss": 0.5934, + "step": 1924 + }, + { + "epoch": 0.15, + "grad_norm": 1.2670900789528925, + "learning_rate": 1.9262533454344303e-05, + "loss": 0.6694, + "step": 1925 + }, + { + "epoch": 0.15, + "grad_norm": 1.2493116867869654, + "learning_rate": 1.9261586132460866e-05, + "loss": 0.5705, + "step": 1926 + }, + { + "epoch": 0.15, + "grad_norm": 1.3729645228209606, + "learning_rate": 1.9260638225845713e-05, + "loss": 0.5774, + "step": 1927 + }, + { + "epoch": 0.15, + "grad_norm": 1.2335169260311833, + "learning_rate": 1.925968973455869e-05, + "loss": 0.5917, + "step": 1928 + }, + { + "epoch": 0.15, + "grad_norm": 1.2142230346274214, + "learning_rate": 1.9258740658659683e-05, + "loss": 0.6177, + "step": 1929 + }, + { + "epoch": 0.15, + "grad_norm": 1.3049324856390505, + "learning_rate": 1.9257790998208606e-05, + "loss": 0.697, + "step": 1930 + }, + { + "epoch": 0.15, + "grad_norm": 1.321748392882289, + "learning_rate": 1.925684075326542e-05, + "loss": 0.6988, + "step": 1931 + }, + { + "epoch": 0.15, + "grad_norm": 1.2409835835307796, + "learning_rate": 1.9255889923890118e-05, + "loss": 0.5907, + "step": 1932 + }, + { + "epoch": 0.15, + "grad_norm": 1.2417434282491842, + "learning_rate": 1.9254938510142735e-05, + "loss": 0.652, + "step": 1933 + }, + { + "epoch": 0.15, + "grad_norm": 1.2861551038103585, + "learning_rate": 1.925398651208333e-05, + "loss": 0.6211, + "step": 1934 + }, + { + "epoch": 0.15, + "grad_norm": 1.4243427701804547, + "learning_rate": 1.925303392977201e-05, + "loss": 0.6335, + "step": 1935 + }, + { + "epoch": 0.15, + "grad_norm": 1.464250286683883, + "learning_rate": 1.9252080763268924e-05, + "loss": 0.7058, + "step": 1936 + }, + { + "epoch": 0.15, + "grad_norm": 1.2647075860172492, + "learning_rate": 1.9251127012634242e-05, + "loss": 0.6536, + "step": 1937 + }, + { + "epoch": 0.15, + "grad_norm": 1.2409793088388401, + "learning_rate": 1.9250172677928184e-05, + "loss": 0.5969, + "step": 1938 + }, + { + "epoch": 0.15, + "grad_norm": 1.3630910046591633, + "learning_rate": 1.9249217759211e-05, + "loss": 0.6866, + "step": 1939 + }, + { + "epoch": 0.15, + "grad_norm": 1.155029529094975, + "learning_rate": 1.924826225654298e-05, + "loss": 0.636, + "step": 1940 + }, + { + "epoch": 0.15, + "grad_norm": 1.228628374757012, + "learning_rate": 1.9247306169984446e-05, + "loss": 0.5917, + "step": 1941 + }, + { + "epoch": 0.15, + "grad_norm": 1.3851029989868728, + "learning_rate": 1.9246349499595767e-05, + "loss": 0.7361, + "step": 1942 + }, + { + "epoch": 0.15, + "grad_norm": 1.1385088062808584, + "learning_rate": 1.9245392245437336e-05, + "loss": 0.618, + "step": 1943 + }, + { + "epoch": 0.15, + "grad_norm": 1.2306579444392938, + "learning_rate": 1.9244434407569596e-05, + "loss": 0.5768, + "step": 1944 + }, + { + "epoch": 0.15, + "grad_norm": 1.2922190541262908, + "learning_rate": 1.9243475986053014e-05, + "loss": 0.6444, + "step": 1945 + }, + { + "epoch": 0.15, + "grad_norm": 1.1461193536916188, + "learning_rate": 1.9242516980948105e-05, + "loss": 0.5994, + "step": 1946 + }, + { + "epoch": 0.15, + "grad_norm": 1.2921450200876514, + "learning_rate": 1.9241557392315413e-05, + "loss": 0.6232, + "step": 1947 + }, + { + "epoch": 0.15, + "grad_norm": 1.3033486537976007, + "learning_rate": 1.9240597220215524e-05, + "loss": 0.5464, + "step": 1948 + }, + { + "epoch": 0.15, + "grad_norm": 1.2579126258946092, + "learning_rate": 1.9239636464709054e-05, + "loss": 0.6244, + "step": 1949 + }, + { + "epoch": 0.15, + "grad_norm": 1.269045458061357, + "learning_rate": 1.9238675125856666e-05, + "loss": 0.6237, + "step": 1950 + }, + { + "epoch": 0.15, + "grad_norm": 1.2859785701455273, + "learning_rate": 1.923771320371905e-05, + "loss": 0.6317, + "step": 1951 + }, + { + "epoch": 0.15, + "grad_norm": 1.1734743013465252, + "learning_rate": 1.9236750698356942e-05, + "loss": 0.6007, + "step": 1952 + }, + { + "epoch": 0.15, + "grad_norm": 1.301467531180983, + "learning_rate": 1.9235787609831105e-05, + "loss": 0.7346, + "step": 1953 + }, + { + "epoch": 0.15, + "grad_norm": 1.2911615563353838, + "learning_rate": 1.9234823938202346e-05, + "loss": 0.6401, + "step": 1954 + }, + { + "epoch": 0.15, + "grad_norm": 1.3560117573833217, + "learning_rate": 1.923385968353151e-05, + "loss": 0.6993, + "step": 1955 + }, + { + "epoch": 0.15, + "grad_norm": 1.2909150192604506, + "learning_rate": 1.9232894845879465e-05, + "loss": 0.5995, + "step": 1956 + }, + { + "epoch": 0.15, + "grad_norm": 1.3388528969166906, + "learning_rate": 1.9231929425307136e-05, + "loss": 0.7416, + "step": 1957 + }, + { + "epoch": 0.15, + "grad_norm": 1.231763854487558, + "learning_rate": 1.9230963421875474e-05, + "loss": 0.6337, + "step": 1958 + }, + { + "epoch": 0.15, + "grad_norm": 1.2729127422882038, + "learning_rate": 1.9229996835645463e-05, + "loss": 0.691, + "step": 1959 + }, + { + "epoch": 0.15, + "grad_norm": 1.305057633029852, + "learning_rate": 1.9229029666678133e-05, + "loss": 0.6141, + "step": 1960 + }, + { + "epoch": 0.15, + "grad_norm": 1.3600797470637918, + "learning_rate": 1.922806191503454e-05, + "loss": 0.6625, + "step": 1961 + }, + { + "epoch": 0.15, + "grad_norm": 1.3920475402420074, + "learning_rate": 1.9227093580775796e-05, + "loss": 0.6639, + "step": 1962 + }, + { + "epoch": 0.15, + "grad_norm": 1.2714037891639316, + "learning_rate": 1.9226124663963023e-05, + "loss": 0.6009, + "step": 1963 + }, + { + "epoch": 0.15, + "grad_norm": 1.1792031613870424, + "learning_rate": 1.92251551646574e-05, + "loss": 0.6305, + "step": 1964 + }, + { + "epoch": 0.15, + "grad_norm": 1.2501618757337087, + "learning_rate": 1.9224185082920138e-05, + "loss": 0.6193, + "step": 1965 + }, + { + "epoch": 0.15, + "grad_norm": 1.1921083976531102, + "learning_rate": 1.922321441881248e-05, + "loss": 0.5905, + "step": 1966 + }, + { + "epoch": 0.15, + "grad_norm": 1.3003995409791924, + "learning_rate": 1.9222243172395706e-05, + "loss": 0.6846, + "step": 1967 + }, + { + "epoch": 0.15, + "grad_norm": 1.3237465435699782, + "learning_rate": 1.9221271343731146e-05, + "loss": 0.6144, + "step": 1968 + }, + { + "epoch": 0.15, + "grad_norm": 1.2862052926022414, + "learning_rate": 1.922029893288015e-05, + "loss": 0.5887, + "step": 1969 + }, + { + "epoch": 0.15, + "grad_norm": 1.2031862813031797, + "learning_rate": 1.921932593990411e-05, + "loss": 0.5889, + "step": 1970 + }, + { + "epoch": 0.15, + "grad_norm": 1.4197782210312542, + "learning_rate": 1.9218352364864457e-05, + "loss": 0.7039, + "step": 1971 + }, + { + "epoch": 0.15, + "grad_norm": 1.2854706168123515, + "learning_rate": 1.921737820782266e-05, + "loss": 0.6811, + "step": 1972 + }, + { + "epoch": 0.15, + "grad_norm": 1.214310507561309, + "learning_rate": 1.921640346884022e-05, + "loss": 0.5814, + "step": 1973 + }, + { + "epoch": 0.15, + "grad_norm": 1.3339426416708104, + "learning_rate": 1.9215428147978684e-05, + "loss": 0.7675, + "step": 1974 + }, + { + "epoch": 0.15, + "grad_norm": 1.2380011208580008, + "learning_rate": 1.9214452245299616e-05, + "loss": 0.609, + "step": 1975 + }, + { + "epoch": 0.15, + "grad_norm": 1.35708987340541, + "learning_rate": 1.9213475760864645e-05, + "loss": 0.6503, + "step": 1976 + }, + { + "epoch": 0.15, + "grad_norm": 1.1958519210877196, + "learning_rate": 1.9212498694735408e-05, + "loss": 0.5885, + "step": 1977 + }, + { + "epoch": 0.15, + "grad_norm": 1.1480532671635908, + "learning_rate": 1.92115210469736e-05, + "loss": 0.6093, + "step": 1978 + }, + { + "epoch": 0.15, + "grad_norm": 1.2542909406428344, + "learning_rate": 1.9210542817640945e-05, + "loss": 0.6319, + "step": 1979 + }, + { + "epoch": 0.15, + "grad_norm": 1.3220536528360614, + "learning_rate": 1.92095640067992e-05, + "loss": 0.6659, + "step": 1980 + }, + { + "epoch": 0.15, + "grad_norm": 1.4459513309197338, + "learning_rate": 1.9208584614510163e-05, + "loss": 0.6722, + "step": 1981 + }, + { + "epoch": 0.15, + "grad_norm": 1.264629349141053, + "learning_rate": 1.9207604640835675e-05, + "loss": 0.623, + "step": 1982 + }, + { + "epoch": 0.15, + "grad_norm": 1.223744989293315, + "learning_rate": 1.9206624085837593e-05, + "loss": 0.6286, + "step": 1983 + }, + { + "epoch": 0.15, + "grad_norm": 1.2420079803302555, + "learning_rate": 1.9205642949577835e-05, + "loss": 0.6352, + "step": 1984 + }, + { + "epoch": 0.15, + "grad_norm": 1.3823857484070743, + "learning_rate": 1.9204661232118343e-05, + "loss": 0.6804, + "step": 1985 + }, + { + "epoch": 0.15, + "grad_norm": 1.2243581997944102, + "learning_rate": 1.92036789335211e-05, + "loss": 0.5816, + "step": 1986 + }, + { + "epoch": 0.15, + "grad_norm": 1.3132825062851852, + "learning_rate": 1.920269605384812e-05, + "loss": 0.6605, + "step": 1987 + }, + { + "epoch": 0.15, + "grad_norm": 1.3311862894492688, + "learning_rate": 1.9201712593161458e-05, + "loss": 0.641, + "step": 1988 + }, + { + "epoch": 0.15, + "grad_norm": 1.3230289301459177, + "learning_rate": 1.9200728551523204e-05, + "loss": 0.6296, + "step": 1989 + }, + { + "epoch": 0.15, + "grad_norm": 1.2198748654931182, + "learning_rate": 1.919974392899549e-05, + "loss": 0.5966, + "step": 1990 + }, + { + "epoch": 0.15, + "grad_norm": 1.2078477388006041, + "learning_rate": 1.9198758725640476e-05, + "loss": 0.6012, + "step": 1991 + }, + { + "epoch": 0.15, + "grad_norm": 1.3875581162112265, + "learning_rate": 1.9197772941520365e-05, + "loss": 0.6671, + "step": 1992 + }, + { + "epoch": 0.15, + "grad_norm": 1.3004054995986967, + "learning_rate": 1.9196786576697392e-05, + "loss": 0.6817, + "step": 1993 + }, + { + "epoch": 0.15, + "grad_norm": 1.3269488793036517, + "learning_rate": 1.9195799631233835e-05, + "loss": 0.6484, + "step": 1994 + }, + { + "epoch": 0.15, + "grad_norm": 1.4012339246169776, + "learning_rate": 1.9194812105192003e-05, + "loss": 0.7139, + "step": 1995 + }, + { + "epoch": 0.15, + "grad_norm": 1.1605691913422316, + "learning_rate": 1.9193823998634242e-05, + "loss": 0.5975, + "step": 1996 + }, + { + "epoch": 0.15, + "grad_norm": 1.467215690301346, + "learning_rate": 1.919283531162294e-05, + "loss": 0.7013, + "step": 1997 + }, + { + "epoch": 0.16, + "grad_norm": 1.324162844937865, + "learning_rate": 1.9191846044220514e-05, + "loss": 0.6951, + "step": 1998 + }, + { + "epoch": 0.16, + "grad_norm": 1.2823524384258291, + "learning_rate": 1.9190856196489424e-05, + "loss": 0.6955, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 1.2663233208022417, + "learning_rate": 1.9189865768492168e-05, + "loss": 0.5796, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 1.196024065834504, + "learning_rate": 1.918887476029127e-05, + "loss": 0.5946, + "step": 2001 + }, + { + "epoch": 0.16, + "grad_norm": 1.3400009997207556, + "learning_rate": 1.9187883171949298e-05, + "loss": 0.6037, + "step": 2002 + }, + { + "epoch": 0.16, + "grad_norm": 1.33373118465062, + "learning_rate": 1.9186891003528857e-05, + "loss": 0.6111, + "step": 2003 + }, + { + "epoch": 0.16, + "grad_norm": 1.121303684901418, + "learning_rate": 1.918589825509259e-05, + "loss": 0.5529, + "step": 2004 + }, + { + "epoch": 0.16, + "grad_norm": 1.2073844713818416, + "learning_rate": 1.9184904926703176e-05, + "loss": 0.638, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 1.3220809739976749, + "learning_rate": 1.9183911018423324e-05, + "loss": 0.6488, + "step": 2006 + }, + { + "epoch": 0.16, + "grad_norm": 1.3576937835383587, + "learning_rate": 1.9182916530315788e-05, + "loss": 0.64, + "step": 2007 + }, + { + "epoch": 0.16, + "grad_norm": 1.1506137599116333, + "learning_rate": 1.9181921462443354e-05, + "loss": 0.6083, + "step": 2008 + }, + { + "epoch": 0.16, + "grad_norm": 1.170778397236292, + "learning_rate": 1.9180925814868843e-05, + "loss": 0.5477, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 1.3602206789518763, + "learning_rate": 1.917992958765512e-05, + "loss": 0.6149, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 1.1666669959113247, + "learning_rate": 1.917893278086508e-05, + "loss": 0.5715, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 1.4121472787584541, + "learning_rate": 1.9177935394561652e-05, + "loss": 0.6676, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 1.2042105100277287, + "learning_rate": 1.9176937428807818e-05, + "loss": 0.5866, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 1.2997939570075012, + "learning_rate": 1.9175938883666574e-05, + "loss": 0.634, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 1.219849457364017, + "learning_rate": 1.9174939759200965e-05, + "loss": 0.6235, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 1.2043239020062682, + "learning_rate": 1.9173940055474074e-05, + "loss": 0.6389, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 1.3267832991276054, + "learning_rate": 1.9172939772549014e-05, + "loss": 0.634, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 1.3324048365809513, + "learning_rate": 1.9171938910488945e-05, + "loss": 0.6531, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 1.358209746341622, + "learning_rate": 1.917093746935705e-05, + "loss": 0.6804, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 1.3244731287406715, + "learning_rate": 1.916993544921655e-05, + "loss": 0.6695, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 1.4140043299391518, + "learning_rate": 1.9168932850130723e-05, + "loss": 0.6657, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 1.2142736931213822, + "learning_rate": 1.9167929672162856e-05, + "loss": 0.604, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 1.2253866208381643, + "learning_rate": 1.9166925915376288e-05, + "loss": 0.6437, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 1.3530651405256198, + "learning_rate": 1.9165921579834395e-05, + "loss": 0.6696, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 1.2925642596711908, + "learning_rate": 1.916491666560058e-05, + "loss": 0.6431, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 1.5131163925310542, + "learning_rate": 1.916391117273829e-05, + "loss": 0.6546, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 1.270793674454692, + "learning_rate": 1.916290510131101e-05, + "loss": 0.6366, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 1.2982945314141887, + "learning_rate": 1.9161898451382257e-05, + "loss": 0.6531, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 1.3241880970335667, + "learning_rate": 1.9160891223015586e-05, + "loss": 0.6709, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 1.2966803439104615, + "learning_rate": 1.9159883416274585e-05, + "loss": 0.6174, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 1.174956326991961, + "learning_rate": 1.915887503122289e-05, + "loss": 0.604, + "step": 2031 + }, + { + "epoch": 0.16, + "grad_norm": 1.3852026590059048, + "learning_rate": 1.9157866067924157e-05, + "loss": 0.6698, + "step": 2032 + }, + { + "epoch": 0.16, + "grad_norm": 1.341433768859375, + "learning_rate": 1.9156856526442092e-05, + "loss": 0.6054, + "step": 2033 + }, + { + "epoch": 0.16, + "grad_norm": 1.3624354583432348, + "learning_rate": 1.915584640684043e-05, + "loss": 0.6264, + "step": 2034 + }, + { + "epoch": 0.16, + "grad_norm": 1.2086475007742867, + "learning_rate": 1.9154835709182947e-05, + "loss": 0.6339, + "step": 2035 + }, + { + "epoch": 0.16, + "grad_norm": 1.3800078576320352, + "learning_rate": 1.9153824433533453e-05, + "loss": 0.622, + "step": 2036 + }, + { + "epoch": 0.16, + "grad_norm": 1.2407871246102744, + "learning_rate": 1.9152812579955795e-05, + "loss": 0.6017, + "step": 2037 + }, + { + "epoch": 0.16, + "grad_norm": 1.3450462831099477, + "learning_rate": 1.915180014851386e-05, + "loss": 0.6193, + "step": 2038 + }, + { + "epoch": 0.16, + "grad_norm": 1.3559467890857104, + "learning_rate": 1.915078713927156e-05, + "loss": 0.6751, + "step": 2039 + }, + { + "epoch": 0.16, + "grad_norm": 1.2963046922613652, + "learning_rate": 1.9149773552292855e-05, + "loss": 0.631, + "step": 2040 + }, + { + "epoch": 0.16, + "grad_norm": 1.2528847309470341, + "learning_rate": 1.9148759387641745e-05, + "loss": 0.554, + "step": 2041 + }, + { + "epoch": 0.16, + "grad_norm": 1.1992937493247826, + "learning_rate": 1.914774464538225e-05, + "loss": 0.5803, + "step": 2042 + }, + { + "epoch": 0.16, + "grad_norm": 1.3610252360849864, + "learning_rate": 1.914672932557844e-05, + "loss": 0.6453, + "step": 2043 + }, + { + "epoch": 0.16, + "grad_norm": 1.4820835295340933, + "learning_rate": 1.9145713428294415e-05, + "loss": 0.7304, + "step": 2044 + }, + { + "epoch": 0.16, + "grad_norm": 1.37786458467916, + "learning_rate": 1.9144696953594316e-05, + "loss": 0.7114, + "step": 2045 + }, + { + "epoch": 0.16, + "grad_norm": 1.207204565549222, + "learning_rate": 1.9143679901542316e-05, + "loss": 0.6328, + "step": 2046 + }, + { + "epoch": 0.16, + "grad_norm": 1.4232089314528413, + "learning_rate": 1.914266227220263e-05, + "loss": 0.6826, + "step": 2047 + }, + { + "epoch": 0.16, + "grad_norm": 1.3197123952581942, + "learning_rate": 1.9141644065639507e-05, + "loss": 0.655, + "step": 2048 + }, + { + "epoch": 0.16, + "grad_norm": 1.2843356309783234, + "learning_rate": 1.914062528191723e-05, + "loss": 0.6741, + "step": 2049 + }, + { + "epoch": 0.16, + "grad_norm": 1.209933646250681, + "learning_rate": 1.9139605921100116e-05, + "loss": 0.6104, + "step": 2050 + }, + { + "epoch": 0.16, + "grad_norm": 1.2265959814082021, + "learning_rate": 1.9138585983252527e-05, + "loss": 0.6151, + "step": 2051 + }, + { + "epoch": 0.16, + "grad_norm": 1.243258128836326, + "learning_rate": 1.913756546843886e-05, + "loss": 0.5764, + "step": 2052 + }, + { + "epoch": 0.16, + "grad_norm": 1.288175104046891, + "learning_rate": 1.9136544376723537e-05, + "loss": 0.6096, + "step": 2053 + }, + { + "epoch": 0.16, + "grad_norm": 1.2952533832675945, + "learning_rate": 1.9135522708171034e-05, + "loss": 0.6438, + "step": 2054 + }, + { + "epoch": 0.16, + "grad_norm": 1.2134435010943878, + "learning_rate": 1.9134500462845844e-05, + "loss": 0.5902, + "step": 2055 + }, + { + "epoch": 0.16, + "grad_norm": 1.2717059948707234, + "learning_rate": 1.9133477640812513e-05, + "loss": 0.5554, + "step": 2056 + }, + { + "epoch": 0.16, + "grad_norm": 1.3004287379538864, + "learning_rate": 1.9132454242135618e-05, + "loss": 0.6017, + "step": 2057 + }, + { + "epoch": 0.16, + "grad_norm": 1.2478443154497603, + "learning_rate": 1.9131430266879766e-05, + "loss": 0.5813, + "step": 2058 + }, + { + "epoch": 0.16, + "grad_norm": 1.451664180089235, + "learning_rate": 1.913040571510961e-05, + "loss": 0.6311, + "step": 2059 + }, + { + "epoch": 0.16, + "grad_norm": 1.2436437647702974, + "learning_rate": 1.9129380586889836e-05, + "loss": 0.585, + "step": 2060 + }, + { + "epoch": 0.16, + "grad_norm": 1.231593365886224, + "learning_rate": 1.9128354882285166e-05, + "loss": 0.6128, + "step": 2061 + }, + { + "epoch": 0.16, + "grad_norm": 1.1896508965398727, + "learning_rate": 1.9127328601360354e-05, + "loss": 0.5924, + "step": 2062 + }, + { + "epoch": 0.16, + "grad_norm": 1.3441368255960633, + "learning_rate": 1.91263017441802e-05, + "loss": 0.67, + "step": 2063 + }, + { + "epoch": 0.16, + "grad_norm": 1.273881799619276, + "learning_rate": 1.9125274310809524e-05, + "loss": 0.6737, + "step": 2064 + }, + { + "epoch": 0.16, + "grad_norm": 1.2376626572493064, + "learning_rate": 1.9124246301313206e-05, + "loss": 0.584, + "step": 2065 + }, + { + "epoch": 0.16, + "grad_norm": 1.366355555680691, + "learning_rate": 1.9123217715756142e-05, + "loss": 0.6793, + "step": 2066 + }, + { + "epoch": 0.16, + "grad_norm": 1.3335146879997093, + "learning_rate": 1.9122188554203275e-05, + "loss": 0.6198, + "step": 2067 + }, + { + "epoch": 0.16, + "grad_norm": 1.3164377180464333, + "learning_rate": 1.9121158816719577e-05, + "loss": 0.6492, + "step": 2068 + }, + { + "epoch": 0.16, + "grad_norm": 1.3035112663838624, + "learning_rate": 1.912012850337007e-05, + "loss": 0.5759, + "step": 2069 + }, + { + "epoch": 0.16, + "grad_norm": 1.4558794720825778, + "learning_rate": 1.9119097614219795e-05, + "loss": 0.6725, + "step": 2070 + }, + { + "epoch": 0.16, + "grad_norm": 1.202374707394164, + "learning_rate": 1.911806614933384e-05, + "loss": 0.6078, + "step": 2071 + }, + { + "epoch": 0.16, + "grad_norm": 1.217059111771495, + "learning_rate": 1.9117034108777323e-05, + "loss": 0.6125, + "step": 2072 + }, + { + "epoch": 0.16, + "grad_norm": 1.2498669553525101, + "learning_rate": 1.9116001492615403e-05, + "loss": 0.6607, + "step": 2073 + }, + { + "epoch": 0.16, + "grad_norm": 1.425125932984774, + "learning_rate": 1.911496830091328e-05, + "loss": 0.7054, + "step": 2074 + }, + { + "epoch": 0.16, + "grad_norm": 1.273080834365659, + "learning_rate": 1.911393453373618e-05, + "loss": 0.6738, + "step": 2075 + }, + { + "epoch": 0.16, + "grad_norm": 1.3463576985158532, + "learning_rate": 1.9112900191149374e-05, + "loss": 0.6432, + "step": 2076 + }, + { + "epoch": 0.16, + "grad_norm": 1.3587925419468845, + "learning_rate": 1.911186527321816e-05, + "loss": 0.6974, + "step": 2077 + }, + { + "epoch": 0.16, + "grad_norm": 1.3472790618695383, + "learning_rate": 1.911082978000788e-05, + "loss": 0.5873, + "step": 2078 + }, + { + "epoch": 0.16, + "grad_norm": 1.2716541089095745, + "learning_rate": 1.910979371158391e-05, + "loss": 0.6436, + "step": 2079 + }, + { + "epoch": 0.16, + "grad_norm": 1.3235672780148966, + "learning_rate": 1.910875706801166e-05, + "loss": 0.601, + "step": 2080 + }, + { + "epoch": 0.16, + "grad_norm": 1.3177046895133842, + "learning_rate": 1.9107719849356588e-05, + "loss": 0.6769, + "step": 2081 + }, + { + "epoch": 0.16, + "grad_norm": 1.272032635641989, + "learning_rate": 1.9106682055684168e-05, + "loss": 0.6449, + "step": 2082 + }, + { + "epoch": 0.16, + "grad_norm": 1.250553247089158, + "learning_rate": 1.9105643687059926e-05, + "loss": 0.6294, + "step": 2083 + }, + { + "epoch": 0.16, + "grad_norm": 1.1390032801258525, + "learning_rate": 1.9104604743549422e-05, + "loss": 0.5746, + "step": 2084 + }, + { + "epoch": 0.16, + "grad_norm": 1.1694509739868435, + "learning_rate": 1.9103565225218243e-05, + "loss": 0.6365, + "step": 2085 + }, + { + "epoch": 0.16, + "grad_norm": 1.2251363834054254, + "learning_rate": 1.9102525132132028e-05, + "loss": 0.6455, + "step": 2086 + }, + { + "epoch": 0.16, + "grad_norm": 1.3180199758436981, + "learning_rate": 1.910148446435643e-05, + "loss": 0.6392, + "step": 2087 + }, + { + "epoch": 0.16, + "grad_norm": 1.3194633778111564, + "learning_rate": 1.910044322195717e-05, + "loss": 0.562, + "step": 2088 + }, + { + "epoch": 0.16, + "grad_norm": 1.288310299667858, + "learning_rate": 1.9099401404999972e-05, + "loss": 0.6323, + "step": 2089 + }, + { + "epoch": 0.16, + "grad_norm": 1.3034362732235119, + "learning_rate": 1.9098359013550617e-05, + "loss": 0.6591, + "step": 2090 + }, + { + "epoch": 0.16, + "grad_norm": 1.2375088758824635, + "learning_rate": 1.9097316047674915e-05, + "loss": 0.6036, + "step": 2091 + }, + { + "epoch": 0.16, + "grad_norm": 1.2155822275673247, + "learning_rate": 1.9096272507438715e-05, + "loss": 0.6065, + "step": 2092 + }, + { + "epoch": 0.16, + "grad_norm": 1.3515791699037465, + "learning_rate": 1.9095228392907904e-05, + "loss": 0.6858, + "step": 2093 + }, + { + "epoch": 0.16, + "grad_norm": 1.2961803552206335, + "learning_rate": 1.90941837041484e-05, + "loss": 0.669, + "step": 2094 + }, + { + "epoch": 0.16, + "grad_norm": 1.2780059591133508, + "learning_rate": 1.9093138441226156e-05, + "loss": 0.5898, + "step": 2095 + }, + { + "epoch": 0.16, + "grad_norm": 1.8588883861136467, + "learning_rate": 1.9092092604207166e-05, + "loss": 0.7157, + "step": 2096 + }, + { + "epoch": 0.16, + "grad_norm": 1.175396675761132, + "learning_rate": 1.9091046193157464e-05, + "loss": 0.6107, + "step": 2097 + }, + { + "epoch": 0.16, + "grad_norm": 1.3872467325364235, + "learning_rate": 1.9089999208143113e-05, + "loss": 0.6484, + "step": 2098 + }, + { + "epoch": 0.16, + "grad_norm": 1.3398876127397998, + "learning_rate": 1.908895164923021e-05, + "loss": 0.6737, + "step": 2099 + }, + { + "epoch": 0.16, + "grad_norm": 1.3554141338679262, + "learning_rate": 1.9087903516484898e-05, + "loss": 0.6957, + "step": 2100 + }, + { + "epoch": 0.16, + "grad_norm": 1.2921149901188436, + "learning_rate": 1.908685480997335e-05, + "loss": 0.6873, + "step": 2101 + }, + { + "epoch": 0.16, + "grad_norm": 1.6798620088990228, + "learning_rate": 1.9085805529761778e-05, + "loss": 0.6263, + "step": 2102 + }, + { + "epoch": 0.16, + "grad_norm": 1.1164184804557442, + "learning_rate": 1.9084755675916423e-05, + "loss": 0.5341, + "step": 2103 + }, + { + "epoch": 0.16, + "grad_norm": 1.306438937965065, + "learning_rate": 1.9083705248503575e-05, + "loss": 0.6658, + "step": 2104 + }, + { + "epoch": 0.16, + "grad_norm": 1.1967670840224751, + "learning_rate": 1.9082654247589543e-05, + "loss": 0.6296, + "step": 2105 + }, + { + "epoch": 0.16, + "grad_norm": 1.3573343591814218, + "learning_rate": 1.9081602673240695e-05, + "loss": 0.6393, + "step": 2106 + }, + { + "epoch": 0.16, + "grad_norm": 1.2865097663677707, + "learning_rate": 1.9080550525523413e-05, + "loss": 0.6403, + "step": 2107 + }, + { + "epoch": 0.16, + "grad_norm": 1.1711245868025382, + "learning_rate": 1.9079497804504122e-05, + "loss": 0.5849, + "step": 2108 + }, + { + "epoch": 0.16, + "grad_norm": 1.3722672616377143, + "learning_rate": 1.9078444510249294e-05, + "loss": 0.6448, + "step": 2109 + }, + { + "epoch": 0.16, + "grad_norm": 1.3890895725010728, + "learning_rate": 1.9077390642825427e-05, + "loss": 0.692, + "step": 2110 + }, + { + "epoch": 0.16, + "grad_norm": 1.2972426295575574, + "learning_rate": 1.9076336202299055e-05, + "loss": 0.6433, + "step": 2111 + }, + { + "epoch": 0.16, + "grad_norm": 1.245771073324085, + "learning_rate": 1.907528118873675e-05, + "loss": 0.6449, + "step": 2112 + }, + { + "epoch": 0.16, + "grad_norm": 1.2418793589991117, + "learning_rate": 1.907422560220512e-05, + "loss": 0.6897, + "step": 2113 + }, + { + "epoch": 0.16, + "grad_norm": 1.222195767106704, + "learning_rate": 1.9073169442770814e-05, + "loss": 0.6081, + "step": 2114 + }, + { + "epoch": 0.16, + "grad_norm": 1.2152395309000437, + "learning_rate": 1.907211271050051e-05, + "loss": 0.6626, + "step": 2115 + }, + { + "epoch": 0.16, + "grad_norm": 1.26159558713706, + "learning_rate": 1.907105540546092e-05, + "loss": 0.69, + "step": 2116 + }, + { + "epoch": 0.16, + "grad_norm": 1.2454860245675936, + "learning_rate": 1.9069997527718803e-05, + "loss": 0.6623, + "step": 2117 + }, + { + "epoch": 0.16, + "grad_norm": 1.206306956236214, + "learning_rate": 1.906893907734095e-05, + "loss": 0.6518, + "step": 2118 + }, + { + "epoch": 0.16, + "grad_norm": 1.2104302605255737, + "learning_rate": 1.9067880054394182e-05, + "loss": 0.6932, + "step": 2119 + }, + { + "epoch": 0.16, + "grad_norm": 1.1725419753151858, + "learning_rate": 1.906682045894536e-05, + "loss": 0.6345, + "step": 2120 + }, + { + "epoch": 0.16, + "grad_norm": 1.3535865666017106, + "learning_rate": 1.9065760291061385e-05, + "loss": 0.6849, + "step": 2121 + }, + { + "epoch": 0.16, + "grad_norm": 1.2504353718734038, + "learning_rate": 1.9064699550809193e-05, + "loss": 0.6639, + "step": 2122 + }, + { + "epoch": 0.16, + "grad_norm": 1.2486380309806338, + "learning_rate": 1.9063638238255747e-05, + "loss": 0.6013, + "step": 2123 + }, + { + "epoch": 0.16, + "grad_norm": 1.2627985440780807, + "learning_rate": 1.9062576353468055e-05, + "loss": 0.6623, + "step": 2124 + }, + { + "epoch": 0.16, + "grad_norm": 1.2502061197093575, + "learning_rate": 1.9061513896513166e-05, + "loss": 0.6208, + "step": 2125 + }, + { + "epoch": 0.16, + "grad_norm": 1.5639339733382052, + "learning_rate": 1.9060450867458152e-05, + "loss": 0.7525, + "step": 2126 + }, + { + "epoch": 0.17, + "grad_norm": 1.2891186326260917, + "learning_rate": 1.9059387266370127e-05, + "loss": 0.6546, + "step": 2127 + }, + { + "epoch": 0.17, + "grad_norm": 1.555948484082821, + "learning_rate": 1.9058323093316247e-05, + "loss": 0.6851, + "step": 2128 + }, + { + "epoch": 0.17, + "grad_norm": 1.2131105675052543, + "learning_rate": 1.905725834836369e-05, + "loss": 0.6762, + "step": 2129 + }, + { + "epoch": 0.17, + "grad_norm": 1.3119491147663953, + "learning_rate": 1.905619303157969e-05, + "loss": 0.6108, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 1.3226456652430132, + "learning_rate": 1.9055127143031495e-05, + "loss": 0.6548, + "step": 2131 + }, + { + "epoch": 0.17, + "grad_norm": 1.3480355876051007, + "learning_rate": 1.905406068278641e-05, + "loss": 0.7146, + "step": 2132 + }, + { + "epoch": 0.17, + "grad_norm": 1.3381445427954821, + "learning_rate": 1.9052993650911758e-05, + "loss": 0.6786, + "step": 2133 + }, + { + "epoch": 0.17, + "grad_norm": 1.2937735274382405, + "learning_rate": 1.905192604747491e-05, + "loss": 0.6446, + "step": 2134 + }, + { + "epoch": 0.17, + "grad_norm": 1.2406057207353547, + "learning_rate": 1.905085787254327e-05, + "loss": 0.6253, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 1.2631125295590657, + "learning_rate": 1.9049789126184275e-05, + "loss": 0.6205, + "step": 2136 + }, + { + "epoch": 0.17, + "grad_norm": 1.1535828216543516, + "learning_rate": 1.90487198084654e-05, + "loss": 0.6474, + "step": 2137 + }, + { + "epoch": 0.17, + "grad_norm": 1.3129414315623575, + "learning_rate": 1.904764991945416e-05, + "loss": 0.6226, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 1.1583938827515148, + "learning_rate": 1.9046579459218103e-05, + "loss": 0.6406, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 1.2867256483994411, + "learning_rate": 1.904550842782481e-05, + "loss": 0.6571, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 1.2765504199125304, + "learning_rate": 1.90444368253419e-05, + "loss": 0.64, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 1.2056213836061969, + "learning_rate": 1.9043364651837026e-05, + "loss": 0.6632, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 1.2662644360426416, + "learning_rate": 1.9042291907377886e-05, + "loss": 0.674, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 1.2909196826644382, + "learning_rate": 1.904121859203221e-05, + "loss": 0.6332, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 1.1797502008410858, + "learning_rate": 1.9040144705867755e-05, + "loss": 0.5879, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 1.0404876457958412, + "learning_rate": 1.9039070248952324e-05, + "loss": 0.6045, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 1.2474719232063831, + "learning_rate": 1.9037995221353754e-05, + "loss": 0.6385, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 1.285709853202506, + "learning_rate": 1.9036919623139916e-05, + "loss": 0.611, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 1.22290587390684, + "learning_rate": 1.9035843454378715e-05, + "loss": 0.5861, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 1.3315507128287138, + "learning_rate": 1.90347667151381e-05, + "loss": 0.5779, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 1.3597209534050894, + "learning_rate": 1.903368940548605e-05, + "loss": 0.6834, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 1.1954379047108512, + "learning_rate": 1.9032611525490575e-05, + "loss": 0.6113, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 1.270987371062318, + "learning_rate": 1.9031533075219737e-05, + "loss": 0.6587, + "step": 2153 + }, + { + "epoch": 0.17, + "grad_norm": 1.2807978320656799, + "learning_rate": 1.903045405474162e-05, + "loss": 0.582, + "step": 2154 + }, + { + "epoch": 0.17, + "grad_norm": 1.2917413792485852, + "learning_rate": 1.9029374464124344e-05, + "loss": 0.651, + "step": 2155 + }, + { + "epoch": 0.17, + "grad_norm": 1.3435415061839777, + "learning_rate": 1.902829430343607e-05, + "loss": 0.6278, + "step": 2156 + }, + { + "epoch": 0.17, + "grad_norm": 1.1928243733335564, + "learning_rate": 1.9027213572745006e-05, + "loss": 0.59, + "step": 2157 + }, + { + "epoch": 0.17, + "grad_norm": 1.393836738042806, + "learning_rate": 1.9026132272119368e-05, + "loss": 0.6737, + "step": 2158 + }, + { + "epoch": 0.17, + "grad_norm": 1.3304006938809796, + "learning_rate": 1.9025050401627433e-05, + "loss": 0.6413, + "step": 2159 + }, + { + "epoch": 0.17, + "grad_norm": 1.3030657711336548, + "learning_rate": 1.90239679613375e-05, + "loss": 0.6559, + "step": 2160 + }, + { + "epoch": 0.17, + "grad_norm": 1.2609678697296773, + "learning_rate": 1.9022884951317916e-05, + "loss": 0.5928, + "step": 2161 + }, + { + "epoch": 0.17, + "grad_norm": 1.2555601436864992, + "learning_rate": 1.9021801371637055e-05, + "loss": 0.6179, + "step": 2162 + }, + { + "epoch": 0.17, + "grad_norm": 1.2895793918843141, + "learning_rate": 1.9020717222363322e-05, + "loss": 0.6373, + "step": 2163 + }, + { + "epoch": 0.17, + "grad_norm": 1.2674462671571585, + "learning_rate": 1.9019632503565174e-05, + "loss": 0.583, + "step": 2164 + }, + { + "epoch": 0.17, + "grad_norm": 1.25094044593771, + "learning_rate": 1.9018547215311086e-05, + "loss": 0.6169, + "step": 2165 + }, + { + "epoch": 0.17, + "grad_norm": 1.1661626317374814, + "learning_rate": 1.9017461357669588e-05, + "loss": 0.6119, + "step": 2166 + }, + { + "epoch": 0.17, + "grad_norm": 1.3096499607292253, + "learning_rate": 1.9016374930709223e-05, + "loss": 0.6436, + "step": 2167 + }, + { + "epoch": 0.17, + "grad_norm": 1.2719579889839332, + "learning_rate": 1.9015287934498598e-05, + "loss": 0.6431, + "step": 2168 + }, + { + "epoch": 0.17, + "grad_norm": 1.3271048274113608, + "learning_rate": 1.901420036910633e-05, + "loss": 0.6619, + "step": 2169 + }, + { + "epoch": 0.17, + "grad_norm": 1.2545936102790916, + "learning_rate": 1.9013112234601084e-05, + "loss": 0.6566, + "step": 2170 + }, + { + "epoch": 0.17, + "grad_norm": 1.3744451964086848, + "learning_rate": 1.9012023531051565e-05, + "loss": 0.6734, + "step": 2171 + }, + { + "epoch": 0.17, + "grad_norm": 1.2590793366678148, + "learning_rate": 1.90109342585265e-05, + "loss": 0.6081, + "step": 2172 + }, + { + "epoch": 0.17, + "grad_norm": 1.2601475809346592, + "learning_rate": 1.900984441709467e-05, + "loss": 0.6688, + "step": 2173 + }, + { + "epoch": 0.17, + "grad_norm": 1.1182569173713621, + "learning_rate": 1.9008754006824874e-05, + "loss": 0.5623, + "step": 2174 + }, + { + "epoch": 0.17, + "grad_norm": 1.3822473351681779, + "learning_rate": 1.900766302778596e-05, + "loss": 0.6451, + "step": 2175 + }, + { + "epoch": 0.17, + "grad_norm": 1.2689384132633361, + "learning_rate": 1.9006571480046805e-05, + "loss": 0.61, + "step": 2176 + }, + { + "epoch": 0.17, + "grad_norm": 1.3104545003567227, + "learning_rate": 1.9005479363676322e-05, + "loss": 0.6262, + "step": 2177 + }, + { + "epoch": 0.17, + "grad_norm": 1.3082689024414194, + "learning_rate": 1.9004386678743468e-05, + "loss": 0.6298, + "step": 2178 + }, + { + "epoch": 0.17, + "grad_norm": 1.3352777737155743, + "learning_rate": 1.9003293425317224e-05, + "loss": 0.6366, + "step": 2179 + }, + { + "epoch": 0.17, + "grad_norm": 1.3554340544995567, + "learning_rate": 1.9002199603466617e-05, + "loss": 0.6191, + "step": 2180 + }, + { + "epoch": 0.17, + "grad_norm": 1.2130912577964308, + "learning_rate": 1.90011052132607e-05, + "loss": 0.6114, + "step": 2181 + }, + { + "epoch": 0.17, + "grad_norm": 1.311628006596029, + "learning_rate": 1.9000010254768576e-05, + "loss": 0.6532, + "step": 2182 + }, + { + "epoch": 0.17, + "grad_norm": 1.1812256644027996, + "learning_rate": 1.899891472805937e-05, + "loss": 0.5825, + "step": 2183 + }, + { + "epoch": 0.17, + "grad_norm": 1.2512998498214598, + "learning_rate": 1.899781863320225e-05, + "loss": 0.6545, + "step": 2184 + }, + { + "epoch": 0.17, + "grad_norm": 1.2153521881332408, + "learning_rate": 1.8996721970266417e-05, + "loss": 0.6306, + "step": 2185 + }, + { + "epoch": 0.17, + "grad_norm": 1.2840863444123651, + "learning_rate": 1.8995624739321104e-05, + "loss": 0.6233, + "step": 2186 + }, + { + "epoch": 0.17, + "grad_norm": 1.2849655241738704, + "learning_rate": 1.8994526940435593e-05, + "loss": 0.6324, + "step": 2187 + }, + { + "epoch": 0.17, + "grad_norm": 1.2846121984343022, + "learning_rate": 1.8993428573679193e-05, + "loss": 0.6277, + "step": 2188 + }, + { + "epoch": 0.17, + "grad_norm": 1.3546217764956616, + "learning_rate": 1.8992329639121243e-05, + "loss": 0.5932, + "step": 2189 + }, + { + "epoch": 0.17, + "grad_norm": 1.221647974177459, + "learning_rate": 1.8991230136831133e-05, + "loss": 0.6211, + "step": 2190 + }, + { + "epoch": 0.17, + "grad_norm": 1.2044068480845211, + "learning_rate": 1.899013006687827e-05, + "loss": 0.6546, + "step": 2191 + }, + { + "epoch": 0.17, + "grad_norm": 1.2214997888659351, + "learning_rate": 1.8989029429332117e-05, + "loss": 0.6488, + "step": 2192 + }, + { + "epoch": 0.17, + "grad_norm": 1.217256805053041, + "learning_rate": 1.898792822426216e-05, + "loss": 0.6382, + "step": 2193 + }, + { + "epoch": 0.17, + "grad_norm": 1.3706500430413289, + "learning_rate": 1.8986826451737918e-05, + "loss": 0.6574, + "step": 2194 + }, + { + "epoch": 0.17, + "grad_norm": 1.3003808399037404, + "learning_rate": 1.898572411182896e-05, + "loss": 0.7057, + "step": 2195 + }, + { + "epoch": 0.17, + "grad_norm": 1.3763865069357932, + "learning_rate": 1.898462120460488e-05, + "loss": 0.6532, + "step": 2196 + }, + { + "epoch": 0.17, + "grad_norm": 1.3425234586204093, + "learning_rate": 1.8983517730135304e-05, + "loss": 0.5741, + "step": 2197 + }, + { + "epoch": 0.17, + "grad_norm": 1.2425613320639064, + "learning_rate": 1.8982413688489906e-05, + "loss": 0.6441, + "step": 2198 + }, + { + "epoch": 0.17, + "grad_norm": 1.2721903492036613, + "learning_rate": 1.898130907973839e-05, + "loss": 0.6709, + "step": 2199 + }, + { + "epoch": 0.17, + "grad_norm": 1.2513695843207595, + "learning_rate": 1.8980203903950495e-05, + "loss": 0.6058, + "step": 2200 + }, + { + "epoch": 0.17, + "grad_norm": 1.3495815953267751, + "learning_rate": 1.8979098161195995e-05, + "loss": 0.7001, + "step": 2201 + }, + { + "epoch": 0.17, + "grad_norm": 1.237918981273691, + "learning_rate": 1.89779918515447e-05, + "loss": 0.5909, + "step": 2202 + }, + { + "epoch": 0.17, + "grad_norm": 1.236287241375756, + "learning_rate": 1.8976884975066464e-05, + "loss": 0.5965, + "step": 2203 + }, + { + "epoch": 0.17, + "grad_norm": 1.272863996116338, + "learning_rate": 1.897577753183116e-05, + "loss": 0.6082, + "step": 2204 + }, + { + "epoch": 0.17, + "grad_norm": 1.2897863494282011, + "learning_rate": 1.8974669521908714e-05, + "loss": 0.6891, + "step": 2205 + }, + { + "epoch": 0.17, + "grad_norm": 1.1921505963574597, + "learning_rate": 1.8973560945369076e-05, + "loss": 0.6242, + "step": 2206 + }, + { + "epoch": 0.17, + "grad_norm": 1.1876158908711067, + "learning_rate": 1.897245180228224e-05, + "loss": 0.6502, + "step": 2207 + }, + { + "epoch": 0.17, + "grad_norm": 1.2537498495419412, + "learning_rate": 1.897134209271823e-05, + "loss": 0.6333, + "step": 2208 + }, + { + "epoch": 0.17, + "grad_norm": 1.1121322859938865, + "learning_rate": 1.8970231816747103e-05, + "loss": 0.5998, + "step": 2209 + }, + { + "epoch": 0.17, + "grad_norm": 1.2086086398035403, + "learning_rate": 1.8969120974438967e-05, + "loss": 0.5792, + "step": 2210 + }, + { + "epoch": 0.17, + "grad_norm": 1.405265208969785, + "learning_rate": 1.896800956586395e-05, + "loss": 0.7066, + "step": 2211 + }, + { + "epoch": 0.17, + "grad_norm": 1.2142595069926199, + "learning_rate": 1.8966897591092213e-05, + "loss": 0.6043, + "step": 2212 + }, + { + "epoch": 0.17, + "grad_norm": 1.3413202808094546, + "learning_rate": 1.8965785050193976e-05, + "loss": 0.6512, + "step": 2213 + }, + { + "epoch": 0.17, + "grad_norm": 1.207949144300272, + "learning_rate": 1.8964671943239467e-05, + "loss": 0.6276, + "step": 2214 + }, + { + "epoch": 0.17, + "grad_norm": 1.1663273533623015, + "learning_rate": 1.8963558270298965e-05, + "loss": 0.6386, + "step": 2215 + }, + { + "epoch": 0.17, + "grad_norm": 1.1964878960239114, + "learning_rate": 1.8962444031442788e-05, + "loss": 0.6284, + "step": 2216 + }, + { + "epoch": 0.17, + "grad_norm": 1.3188564691457447, + "learning_rate": 1.8961329226741277e-05, + "loss": 0.6064, + "step": 2217 + }, + { + "epoch": 0.17, + "grad_norm": 1.2804359082705383, + "learning_rate": 1.8960213856264818e-05, + "loss": 0.6543, + "step": 2218 + }, + { + "epoch": 0.17, + "grad_norm": 1.2107124365572628, + "learning_rate": 1.8959097920083828e-05, + "loss": 0.5967, + "step": 2219 + }, + { + "epoch": 0.17, + "grad_norm": 1.4385573603358472, + "learning_rate": 1.8957981418268764e-05, + "loss": 0.6438, + "step": 2220 + }, + { + "epoch": 0.17, + "grad_norm": 1.1374573814875977, + "learning_rate": 1.8956864350890117e-05, + "loss": 0.5845, + "step": 2221 + }, + { + "epoch": 0.17, + "grad_norm": 1.428789617701718, + "learning_rate": 1.8955746718018413e-05, + "loss": 0.657, + "step": 2222 + }, + { + "epoch": 0.17, + "grad_norm": 1.2356568453075751, + "learning_rate": 1.895462851972421e-05, + "loss": 0.6111, + "step": 2223 + }, + { + "epoch": 0.17, + "grad_norm": 1.1890877599451777, + "learning_rate": 1.895350975607811e-05, + "loss": 0.5742, + "step": 2224 + }, + { + "epoch": 0.17, + "grad_norm": 1.2546147041533051, + "learning_rate": 1.8952390427150747e-05, + "loss": 0.6602, + "step": 2225 + }, + { + "epoch": 0.17, + "grad_norm": 1.269079227670734, + "learning_rate": 1.8951270533012786e-05, + "loss": 0.6248, + "step": 2226 + }, + { + "epoch": 0.17, + "grad_norm": 1.3069233723850515, + "learning_rate": 1.8950150073734937e-05, + "loss": 0.6711, + "step": 2227 + }, + { + "epoch": 0.17, + "grad_norm": 1.3370850302563713, + "learning_rate": 1.8949029049387933e-05, + "loss": 0.6741, + "step": 2228 + }, + { + "epoch": 0.17, + "grad_norm": 1.3543312510706706, + "learning_rate": 1.8947907460042558e-05, + "loss": 0.5988, + "step": 2229 + }, + { + "epoch": 0.17, + "grad_norm": 1.3573715529959696, + "learning_rate": 1.8946785305769616e-05, + "loss": 0.6502, + "step": 2230 + }, + { + "epoch": 0.17, + "grad_norm": 1.2871090045285887, + "learning_rate": 1.894566258663996e-05, + "loss": 0.6669, + "step": 2231 + }, + { + "epoch": 0.17, + "grad_norm": 1.20930375670513, + "learning_rate": 1.894453930272447e-05, + "loss": 0.5849, + "step": 2232 + }, + { + "epoch": 0.17, + "grad_norm": 1.2904696558089934, + "learning_rate": 1.8943415454094068e-05, + "loss": 0.5952, + "step": 2233 + }, + { + "epoch": 0.17, + "grad_norm": 1.3465359218946358, + "learning_rate": 1.894229104081971e-05, + "loss": 0.6687, + "step": 2234 + }, + { + "epoch": 0.17, + "grad_norm": 1.3892760431259157, + "learning_rate": 1.8941166062972374e-05, + "loss": 0.6687, + "step": 2235 + }, + { + "epoch": 0.17, + "grad_norm": 1.172477719760186, + "learning_rate": 1.89400405206231e-05, + "loss": 0.5805, + "step": 2236 + }, + { + "epoch": 0.17, + "grad_norm": 1.329783089022568, + "learning_rate": 1.893891441384294e-05, + "loss": 0.6709, + "step": 2237 + }, + { + "epoch": 0.17, + "grad_norm": 1.1965713354797212, + "learning_rate": 1.8937787742703e-05, + "loss": 0.5919, + "step": 2238 + }, + { + "epoch": 0.17, + "grad_norm": 1.1834011109959846, + "learning_rate": 1.8936660507274403e-05, + "loss": 0.6151, + "step": 2239 + }, + { + "epoch": 0.17, + "grad_norm": 1.2269391010191004, + "learning_rate": 1.8935532707628322e-05, + "loss": 0.6235, + "step": 2240 + }, + { + "epoch": 0.17, + "grad_norm": 1.398545969587041, + "learning_rate": 1.8934404343835956e-05, + "loss": 0.7054, + "step": 2241 + }, + { + "epoch": 0.17, + "grad_norm": 1.2660875299426408, + "learning_rate": 1.893327541596855e-05, + "loss": 0.675, + "step": 2242 + }, + { + "epoch": 0.17, + "grad_norm": 1.350481736803621, + "learning_rate": 1.893214592409738e-05, + "loss": 0.6923, + "step": 2243 + }, + { + "epoch": 0.17, + "grad_norm": 1.257509230547599, + "learning_rate": 1.8931015868293755e-05, + "loss": 0.6031, + "step": 2244 + }, + { + "epoch": 0.17, + "grad_norm": 1.2712820803365117, + "learning_rate": 1.8929885248629017e-05, + "loss": 0.6541, + "step": 2245 + }, + { + "epoch": 0.17, + "grad_norm": 1.3138848673432098, + "learning_rate": 1.8928754065174552e-05, + "loss": 0.6442, + "step": 2246 + }, + { + "epoch": 0.17, + "grad_norm": 1.1687879464420263, + "learning_rate": 1.8927622318001778e-05, + "loss": 0.6104, + "step": 2247 + }, + { + "epoch": 0.17, + "grad_norm": 1.1142975033670908, + "learning_rate": 1.8926490007182147e-05, + "loss": 0.5627, + "step": 2248 + }, + { + "epoch": 0.17, + "grad_norm": 1.2697622705217075, + "learning_rate": 1.8925357132787142e-05, + "loss": 0.6007, + "step": 2249 + }, + { + "epoch": 0.17, + "grad_norm": 1.1813173466613234, + "learning_rate": 1.8924223694888297e-05, + "loss": 0.5944, + "step": 2250 + }, + { + "epoch": 0.17, + "grad_norm": 1.5465209247445078, + "learning_rate": 1.8923089693557165e-05, + "loss": 0.6431, + "step": 2251 + }, + { + "epoch": 0.17, + "grad_norm": 1.2957509696437204, + "learning_rate": 1.8921955128865343e-05, + "loss": 0.6926, + "step": 2252 + }, + { + "epoch": 0.17, + "grad_norm": 1.2173761302434725, + "learning_rate": 1.8920820000884467e-05, + "loss": 0.5983, + "step": 2253 + }, + { + "epoch": 0.17, + "grad_norm": 1.177543156474835, + "learning_rate": 1.8919684309686193e-05, + "loss": 0.5842, + "step": 2254 + }, + { + "epoch": 0.17, + "grad_norm": 1.412638542672742, + "learning_rate": 1.8918548055342232e-05, + "loss": 0.6856, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 1.3146443334211353, + "learning_rate": 1.891741123792432e-05, + "loss": 0.6706, + "step": 2256 + }, + { + "epoch": 0.18, + "grad_norm": 1.2936881561977949, + "learning_rate": 1.891627385750423e-05, + "loss": 0.6069, + "step": 2257 + }, + { + "epoch": 0.18, + "grad_norm": 1.334859491161905, + "learning_rate": 1.8915135914153766e-05, + "loss": 0.7092, + "step": 2258 + }, + { + "epoch": 0.18, + "grad_norm": 1.3024241751567636, + "learning_rate": 1.8913997407944776e-05, + "loss": 0.6325, + "step": 2259 + }, + { + "epoch": 0.18, + "grad_norm": 1.251937128164181, + "learning_rate": 1.891285833894914e-05, + "loss": 0.6681, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 1.2178926631316538, + "learning_rate": 1.8911718707238772e-05, + "loss": 0.6195, + "step": 2261 + }, + { + "epoch": 0.18, + "grad_norm": 1.358931765150668, + "learning_rate": 1.8910578512885624e-05, + "loss": 0.6746, + "step": 2262 + }, + { + "epoch": 0.18, + "grad_norm": 1.2550699888524264, + "learning_rate": 1.8909437755961683e-05, + "loss": 0.5919, + "step": 2263 + }, + { + "epoch": 0.18, + "grad_norm": 1.1356355940178202, + "learning_rate": 1.890829643653897e-05, + "loss": 0.642, + "step": 2264 + }, + { + "epoch": 0.18, + "grad_norm": 1.2172656189673872, + "learning_rate": 1.890715455468954e-05, + "loss": 0.628, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 1.258034491052529, + "learning_rate": 1.890601211048549e-05, + "loss": 0.6401, + "step": 2266 + }, + { + "epoch": 0.18, + "grad_norm": 1.333558937695525, + "learning_rate": 1.8904869103998947e-05, + "loss": 0.691, + "step": 2267 + }, + { + "epoch": 0.18, + "grad_norm": 1.2932833378180306, + "learning_rate": 1.8903725535302073e-05, + "loss": 0.6411, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 1.2341741084549087, + "learning_rate": 1.8902581404467067e-05, + "loss": 0.6379, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 1.2396277198003882, + "learning_rate": 1.890143671156617e-05, + "loss": 0.5427, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 1.2724096912946752, + "learning_rate": 1.8900291456671645e-05, + "loss": 0.5593, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 1.2147699560585856, + "learning_rate": 1.8899145639855803e-05, + "loss": 0.6228, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 1.3003643845683441, + "learning_rate": 1.889799926119098e-05, + "loss": 0.5676, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 1.385570168655623, + "learning_rate": 1.8896852320749558e-05, + "loss": 0.6653, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 1.2159537000907827, + "learning_rate": 1.8895704818603947e-05, + "loss": 0.6018, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 1.346308468228186, + "learning_rate": 1.8894556754826597e-05, + "loss": 0.6397, + "step": 2276 + }, + { + "epoch": 0.18, + "grad_norm": 1.331505903980833, + "learning_rate": 1.889340812948999e-05, + "loss": 0.6712, + "step": 2277 + }, + { + "epoch": 0.18, + "grad_norm": 1.337518080473291, + "learning_rate": 1.8892258942666637e-05, + "loss": 0.6874, + "step": 2278 + }, + { + "epoch": 0.18, + "grad_norm": 1.3073913971853512, + "learning_rate": 1.8891109194429102e-05, + "loss": 0.6233, + "step": 2279 + }, + { + "epoch": 0.18, + "grad_norm": 1.2000252065395383, + "learning_rate": 1.8889958884849972e-05, + "loss": 0.6291, + "step": 2280 + }, + { + "epoch": 0.18, + "grad_norm": 1.2879512273930918, + "learning_rate": 1.8888808014001874e-05, + "loss": 0.6618, + "step": 2281 + }, + { + "epoch": 0.18, + "grad_norm": 1.3074045271872268, + "learning_rate": 1.8887656581957464e-05, + "loss": 0.7366, + "step": 2282 + }, + { + "epoch": 0.18, + "grad_norm": 1.334403646550239, + "learning_rate": 1.888650458878944e-05, + "loss": 0.6571, + "step": 2283 + }, + { + "epoch": 0.18, + "grad_norm": 1.1870009728984494, + "learning_rate": 1.888535203457053e-05, + "loss": 0.5928, + "step": 2284 + }, + { + "epoch": 0.18, + "grad_norm": 1.2679478087138498, + "learning_rate": 1.8884198919373507e-05, + "loss": 0.6118, + "step": 2285 + }, + { + "epoch": 0.18, + "grad_norm": 1.2155057323953022, + "learning_rate": 1.8883045243271168e-05, + "loss": 0.6207, + "step": 2286 + }, + { + "epoch": 0.18, + "grad_norm": 1.3155058546323852, + "learning_rate": 1.8881891006336354e-05, + "loss": 0.6429, + "step": 2287 + }, + { + "epoch": 0.18, + "grad_norm": 1.3744120641327664, + "learning_rate": 1.8880736208641937e-05, + "loss": 0.6492, + "step": 2288 + }, + { + "epoch": 0.18, + "grad_norm": 1.2442458750522478, + "learning_rate": 1.887958085026082e-05, + "loss": 0.6134, + "step": 2289 + }, + { + "epoch": 0.18, + "grad_norm": 1.3569818675489023, + "learning_rate": 1.8878424931265957e-05, + "loss": 0.6853, + "step": 2290 + }, + { + "epoch": 0.18, + "grad_norm": 1.3909016880420917, + "learning_rate": 1.887726845173032e-05, + "loss": 0.691, + "step": 2291 + }, + { + "epoch": 0.18, + "grad_norm": 1.1964597494621914, + "learning_rate": 1.887611141172692e-05, + "loss": 0.6377, + "step": 2292 + }, + { + "epoch": 0.18, + "grad_norm": 1.2263453102985502, + "learning_rate": 1.8874953811328817e-05, + "loss": 0.6161, + "step": 2293 + }, + { + "epoch": 0.18, + "grad_norm": 1.3458127889255969, + "learning_rate": 1.8873795650609092e-05, + "loss": 0.7277, + "step": 2294 + }, + { + "epoch": 0.18, + "grad_norm": 1.2911352889889798, + "learning_rate": 1.8872636929640863e-05, + "loss": 0.6348, + "step": 2295 + }, + { + "epoch": 0.18, + "grad_norm": 1.3019295970440106, + "learning_rate": 1.8871477648497285e-05, + "loss": 0.674, + "step": 2296 + }, + { + "epoch": 0.18, + "grad_norm": 1.2017848489944882, + "learning_rate": 1.8870317807251555e-05, + "loss": 0.6484, + "step": 2297 + }, + { + "epoch": 0.18, + "grad_norm": 1.1973057004310674, + "learning_rate": 1.8869157405976896e-05, + "loss": 0.6222, + "step": 2298 + }, + { + "epoch": 0.18, + "grad_norm": 1.2511482210332108, + "learning_rate": 1.8867996444746574e-05, + "loss": 0.598, + "step": 2299 + }, + { + "epoch": 0.18, + "grad_norm": 1.2101373797842334, + "learning_rate": 1.886683492363388e-05, + "loss": 0.6136, + "step": 2300 + }, + { + "epoch": 0.18, + "grad_norm": 1.2807353777254566, + "learning_rate": 1.886567284271215e-05, + "loss": 0.6396, + "step": 2301 + }, + { + "epoch": 0.18, + "grad_norm": 1.2229585608996965, + "learning_rate": 1.886451020205476e-05, + "loss": 0.6031, + "step": 2302 + }, + { + "epoch": 0.18, + "grad_norm": 1.4295672590211297, + "learning_rate": 1.8863347001735098e-05, + "loss": 0.6923, + "step": 2303 + }, + { + "epoch": 0.18, + "grad_norm": 1.24686252712044, + "learning_rate": 1.8862183241826613e-05, + "loss": 0.6227, + "step": 2304 + }, + { + "epoch": 0.18, + "grad_norm": 1.2491685009095252, + "learning_rate": 1.8861018922402776e-05, + "loss": 0.6716, + "step": 2305 + }, + { + "epoch": 0.18, + "grad_norm": 1.323084342522879, + "learning_rate": 1.88598540435371e-05, + "loss": 0.6527, + "step": 2306 + }, + { + "epoch": 0.18, + "grad_norm": 1.317985425118884, + "learning_rate": 1.8858688605303127e-05, + "loss": 0.6852, + "step": 2307 + }, + { + "epoch": 0.18, + "grad_norm": 1.5015658312074862, + "learning_rate": 1.8857522607774438e-05, + "loss": 0.6642, + "step": 2308 + }, + { + "epoch": 0.18, + "grad_norm": 1.2061895009419612, + "learning_rate": 1.8856356051024646e-05, + "loss": 0.6094, + "step": 2309 + }, + { + "epoch": 0.18, + "grad_norm": 1.1317907526773971, + "learning_rate": 1.8855188935127405e-05, + "loss": 0.5303, + "step": 2310 + }, + { + "epoch": 0.18, + "grad_norm": 1.247687823923802, + "learning_rate": 1.88540212601564e-05, + "loss": 0.6095, + "step": 2311 + }, + { + "epoch": 0.18, + "grad_norm": 1.2911394899537405, + "learning_rate": 1.8852853026185348e-05, + "loss": 0.6758, + "step": 2312 + }, + { + "epoch": 0.18, + "grad_norm": 1.2499499310956306, + "learning_rate": 1.8851684233288016e-05, + "loss": 0.5879, + "step": 2313 + }, + { + "epoch": 0.18, + "grad_norm": 1.4953969423373774, + "learning_rate": 1.8850514881538186e-05, + "loss": 0.6632, + "step": 2314 + }, + { + "epoch": 0.18, + "grad_norm": 1.1763402174141706, + "learning_rate": 1.884934497100969e-05, + "loss": 0.5729, + "step": 2315 + }, + { + "epoch": 0.18, + "grad_norm": 1.1899228225653142, + "learning_rate": 1.8848174501776388e-05, + "loss": 0.6233, + "step": 2316 + }, + { + "epoch": 0.18, + "grad_norm": 1.3122335799344167, + "learning_rate": 1.8847003473912182e-05, + "loss": 0.5731, + "step": 2317 + }, + { + "epoch": 0.18, + "grad_norm": 1.2591900597416277, + "learning_rate": 1.8845831887490998e-05, + "loss": 0.6578, + "step": 2318 + }, + { + "epoch": 0.18, + "grad_norm": 1.2976041835051901, + "learning_rate": 1.8844659742586813e-05, + "loss": 0.6085, + "step": 2319 + }, + { + "epoch": 0.18, + "grad_norm": 1.1819868100845572, + "learning_rate": 1.8843487039273626e-05, + "loss": 0.61, + "step": 2320 + }, + { + "epoch": 0.18, + "grad_norm": 1.3166800647166592, + "learning_rate": 1.884231377762547e-05, + "loss": 0.6964, + "step": 2321 + }, + { + "epoch": 0.18, + "grad_norm": 1.3207310571794146, + "learning_rate": 1.884113995771643e-05, + "loss": 0.6904, + "step": 2322 + }, + { + "epoch": 0.18, + "grad_norm": 1.2725567726705949, + "learning_rate": 1.883996557962061e-05, + "loss": 0.6333, + "step": 2323 + }, + { + "epoch": 0.18, + "grad_norm": 1.2219432665396297, + "learning_rate": 1.8838790643412152e-05, + "loss": 0.6419, + "step": 2324 + }, + { + "epoch": 0.18, + "grad_norm": 1.351336112935263, + "learning_rate": 1.883761514916524e-05, + "loss": 0.6341, + "step": 2325 + }, + { + "epoch": 0.18, + "grad_norm": 1.3525928252514245, + "learning_rate": 1.8836439096954086e-05, + "loss": 0.6883, + "step": 2326 + }, + { + "epoch": 0.18, + "grad_norm": 1.2945120552771745, + "learning_rate": 1.8835262486852944e-05, + "loss": 0.6079, + "step": 2327 + }, + { + "epoch": 0.18, + "grad_norm": 1.2097145550165276, + "learning_rate": 1.8834085318936096e-05, + "loss": 0.5955, + "step": 2328 + }, + { + "epoch": 0.18, + "grad_norm": 1.2838586442210311, + "learning_rate": 1.8832907593277868e-05, + "loss": 0.6218, + "step": 2329 + }, + { + "epoch": 0.18, + "grad_norm": 1.2159462001845165, + "learning_rate": 1.8831729309952605e-05, + "loss": 0.6233, + "step": 2330 + }, + { + "epoch": 0.18, + "grad_norm": 1.3050101789172515, + "learning_rate": 1.883055046903471e-05, + "loss": 0.6007, + "step": 2331 + }, + { + "epoch": 0.18, + "grad_norm": 1.257966500089446, + "learning_rate": 1.8829371070598604e-05, + "loss": 0.5754, + "step": 2332 + }, + { + "epoch": 0.18, + "grad_norm": 1.2911298415637713, + "learning_rate": 1.8828191114718747e-05, + "loss": 0.6959, + "step": 2333 + }, + { + "epoch": 0.18, + "grad_norm": 1.2363132277038633, + "learning_rate": 1.8827010601469634e-05, + "loss": 0.5912, + "step": 2334 + }, + { + "epoch": 0.18, + "grad_norm": 1.3521765488211308, + "learning_rate": 1.882582953092581e-05, + "loss": 0.6586, + "step": 2335 + }, + { + "epoch": 0.18, + "grad_norm": 1.117424466270734, + "learning_rate": 1.8824647903161824e-05, + "loss": 0.5572, + "step": 2336 + }, + { + "epoch": 0.18, + "grad_norm": 1.2016136783569273, + "learning_rate": 1.882346571825229e-05, + "loss": 0.6283, + "step": 2337 + }, + { + "epoch": 0.18, + "grad_norm": 1.2291891559332284, + "learning_rate": 1.882228297627184e-05, + "loss": 0.6335, + "step": 2338 + }, + { + "epoch": 0.18, + "grad_norm": 1.3391902653423522, + "learning_rate": 1.882109967729515e-05, + "loss": 0.6329, + "step": 2339 + }, + { + "epoch": 0.18, + "grad_norm": 1.2842046118808663, + "learning_rate": 1.881991582139693e-05, + "loss": 0.5719, + "step": 2340 + }, + { + "epoch": 0.18, + "grad_norm": 1.338455192454326, + "learning_rate": 1.8818731408651914e-05, + "loss": 0.69, + "step": 2341 + }, + { + "epoch": 0.18, + "grad_norm": 1.3343924349286822, + "learning_rate": 1.8817546439134883e-05, + "loss": 0.6427, + "step": 2342 + }, + { + "epoch": 0.18, + "grad_norm": 1.3439261742365523, + "learning_rate": 1.881636091292066e-05, + "loss": 0.6574, + "step": 2343 + }, + { + "epoch": 0.18, + "grad_norm": 1.379812704520425, + "learning_rate": 1.8815174830084084e-05, + "loss": 0.6522, + "step": 2344 + }, + { + "epoch": 0.18, + "grad_norm": 1.329980384495578, + "learning_rate": 1.881398819070004e-05, + "loss": 0.6854, + "step": 2345 + }, + { + "epoch": 0.18, + "grad_norm": 1.31596865299829, + "learning_rate": 1.8812800994843446e-05, + "loss": 0.6494, + "step": 2346 + }, + { + "epoch": 0.18, + "grad_norm": 1.3828628013643132, + "learning_rate": 1.8811613242589257e-05, + "loss": 0.6334, + "step": 2347 + }, + { + "epoch": 0.18, + "grad_norm": 1.3122544285966038, + "learning_rate": 1.8810424934012464e-05, + "loss": 0.6726, + "step": 2348 + }, + { + "epoch": 0.18, + "grad_norm": 1.4490422143888047, + "learning_rate": 1.880923606918809e-05, + "loss": 0.6331, + "step": 2349 + }, + { + "epoch": 0.18, + "grad_norm": 1.2305817567309718, + "learning_rate": 1.8808046648191193e-05, + "loss": 0.6084, + "step": 2350 + }, + { + "epoch": 0.18, + "grad_norm": 1.406349771986899, + "learning_rate": 1.8806856671096866e-05, + "loss": 0.6066, + "step": 2351 + }, + { + "epoch": 0.18, + "grad_norm": 1.4005646537639251, + "learning_rate": 1.880566613798024e-05, + "loss": 0.7023, + "step": 2352 + }, + { + "epoch": 0.18, + "grad_norm": 1.153702842848037, + "learning_rate": 1.880447504891648e-05, + "loss": 0.5712, + "step": 2353 + }, + { + "epoch": 0.18, + "grad_norm": 1.1825560499621932, + "learning_rate": 1.8803283403980784e-05, + "loss": 0.6223, + "step": 2354 + }, + { + "epoch": 0.18, + "grad_norm": 1.3774579793263662, + "learning_rate": 1.8802091203248388e-05, + "loss": 0.6601, + "step": 2355 + }, + { + "epoch": 0.18, + "grad_norm": 1.1526030167983166, + "learning_rate": 1.880089844679456e-05, + "loss": 0.5768, + "step": 2356 + }, + { + "epoch": 0.18, + "grad_norm": 1.30984746743837, + "learning_rate": 1.879970513469461e-05, + "loss": 0.6519, + "step": 2357 + }, + { + "epoch": 0.18, + "grad_norm": 1.3555014218836876, + "learning_rate": 1.8798511267023874e-05, + "loss": 0.6513, + "step": 2358 + }, + { + "epoch": 0.18, + "grad_norm": 1.2281907566317205, + "learning_rate": 1.8797316843857723e-05, + "loss": 0.566, + "step": 2359 + }, + { + "epoch": 0.18, + "grad_norm": 1.382401701690258, + "learning_rate": 1.8796121865271578e-05, + "loss": 0.6631, + "step": 2360 + }, + { + "epoch": 0.18, + "grad_norm": 1.1713213820878148, + "learning_rate": 1.8794926331340874e-05, + "loss": 0.5769, + "step": 2361 + }, + { + "epoch": 0.18, + "grad_norm": 1.3037626906277462, + "learning_rate": 1.8793730242141093e-05, + "loss": 0.6527, + "step": 2362 + }, + { + "epoch": 0.18, + "grad_norm": 1.2611793333432209, + "learning_rate": 1.8792533597747756e-05, + "loss": 0.6272, + "step": 2363 + }, + { + "epoch": 0.18, + "grad_norm": 1.297287840797995, + "learning_rate": 1.879133639823641e-05, + "loss": 0.5832, + "step": 2364 + }, + { + "epoch": 0.18, + "grad_norm": 1.3728048835749782, + "learning_rate": 1.8790138643682633e-05, + "loss": 0.6323, + "step": 2365 + }, + { + "epoch": 0.18, + "grad_norm": 1.1537831772221736, + "learning_rate": 1.878894033416206e-05, + "loss": 0.581, + "step": 2366 + }, + { + "epoch": 0.18, + "grad_norm": 1.3256932604245857, + "learning_rate": 1.8787741469750332e-05, + "loss": 0.6431, + "step": 2367 + }, + { + "epoch": 0.18, + "grad_norm": 1.2715987990498159, + "learning_rate": 1.8786542050523152e-05, + "loss": 0.6142, + "step": 2368 + }, + { + "epoch": 0.18, + "grad_norm": 1.3999320201036762, + "learning_rate": 1.8785342076556236e-05, + "loss": 0.6424, + "step": 2369 + }, + { + "epoch": 0.18, + "grad_norm": 1.3710288877441834, + "learning_rate": 1.878414154792535e-05, + "loss": 0.7017, + "step": 2370 + }, + { + "epoch": 0.18, + "grad_norm": 1.368623731180025, + "learning_rate": 1.878294046470629e-05, + "loss": 0.6635, + "step": 2371 + }, + { + "epoch": 0.18, + "grad_norm": 1.3156077965995685, + "learning_rate": 1.878173882697488e-05, + "loss": 0.5893, + "step": 2372 + }, + { + "epoch": 0.18, + "grad_norm": 1.2967169332118074, + "learning_rate": 1.8780536634806995e-05, + "loss": 0.626, + "step": 2373 + }, + { + "epoch": 0.18, + "grad_norm": 1.1316281671862123, + "learning_rate": 1.8779333888278524e-05, + "loss": 0.5864, + "step": 2374 + }, + { + "epoch": 0.18, + "grad_norm": 1.2839720119325606, + "learning_rate": 1.8778130587465414e-05, + "loss": 0.6651, + "step": 2375 + }, + { + "epoch": 0.18, + "grad_norm": 1.1422180129957835, + "learning_rate": 1.877692673244363e-05, + "loss": 0.6014, + "step": 2376 + }, + { + "epoch": 0.18, + "grad_norm": 1.275558343815793, + "learning_rate": 1.877572232328918e-05, + "loss": 0.5993, + "step": 2377 + }, + { + "epoch": 0.18, + "grad_norm": 1.1297881697546033, + "learning_rate": 1.8774517360078098e-05, + "loss": 0.5735, + "step": 2378 + }, + { + "epoch": 0.18, + "grad_norm": 1.2213090781190319, + "learning_rate": 1.877331184288647e-05, + "loss": 0.5763, + "step": 2379 + }, + { + "epoch": 0.18, + "grad_norm": 1.2156443518346145, + "learning_rate": 1.8772105771790397e-05, + "loss": 0.5928, + "step": 2380 + }, + { + "epoch": 0.18, + "grad_norm": 1.2991646466943985, + "learning_rate": 1.877089914686603e-05, + "loss": 0.6339, + "step": 2381 + }, + { + "epoch": 0.18, + "grad_norm": 1.2946729694115477, + "learning_rate": 1.8769691968189548e-05, + "loss": 0.638, + "step": 2382 + }, + { + "epoch": 0.18, + "grad_norm": 1.2586145627783092, + "learning_rate": 1.876848423583717e-05, + "loss": 0.5576, + "step": 2383 + }, + { + "epoch": 0.18, + "grad_norm": 1.2176354031006298, + "learning_rate": 1.8767275949885136e-05, + "loss": 0.6671, + "step": 2384 + }, + { + "epoch": 0.19, + "grad_norm": 1.2821997402201015, + "learning_rate": 1.876606711040974e-05, + "loss": 0.6781, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 1.226713912268256, + "learning_rate": 1.8764857717487304e-05, + "loss": 0.607, + "step": 2386 + }, + { + "epoch": 0.19, + "grad_norm": 1.2588977751979855, + "learning_rate": 1.8763647771194177e-05, + "loss": 0.6237, + "step": 2387 + }, + { + "epoch": 0.19, + "grad_norm": 1.2773613884784183, + "learning_rate": 1.8762437271606752e-05, + "loss": 0.6581, + "step": 2388 + }, + { + "epoch": 0.19, + "grad_norm": 1.2282736438359942, + "learning_rate": 1.8761226218801455e-05, + "loss": 0.5936, + "step": 2389 + }, + { + "epoch": 0.19, + "grad_norm": 1.267149442766819, + "learning_rate": 1.8760014612854746e-05, + "loss": 0.6818, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 1.32343568407675, + "learning_rate": 1.8758802453843112e-05, + "loss": 0.6849, + "step": 2391 + }, + { + "epoch": 0.19, + "grad_norm": 1.1820451028419297, + "learning_rate": 1.8757589741843095e-05, + "loss": 0.6348, + "step": 2392 + }, + { + "epoch": 0.19, + "grad_norm": 1.258630758886283, + "learning_rate": 1.8756376476931252e-05, + "loss": 0.6219, + "step": 2393 + }, + { + "epoch": 0.19, + "grad_norm": 1.2469574138586004, + "learning_rate": 1.8755162659184186e-05, + "loss": 0.6137, + "step": 2394 + }, + { + "epoch": 0.19, + "grad_norm": 1.2272257864127885, + "learning_rate": 1.8753948288678533e-05, + "loss": 0.6553, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 1.2942705314323744, + "learning_rate": 1.8752733365490957e-05, + "loss": 0.631, + "step": 2396 + }, + { + "epoch": 0.19, + "grad_norm": 1.2651904737741642, + "learning_rate": 1.875151788969817e-05, + "loss": 0.6386, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 1.2810628335597505, + "learning_rate": 1.8750301861376903e-05, + "loss": 0.699, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 1.2614042758252852, + "learning_rate": 1.8749085280603935e-05, + "loss": 0.6301, + "step": 2399 + }, + { + "epoch": 0.19, + "grad_norm": 1.2883318593278739, + "learning_rate": 1.8747868147456072e-05, + "loss": 0.63, + "step": 2400 + }, + { + "epoch": 0.19, + "grad_norm": 1.2328140916318495, + "learning_rate": 1.874665046201016e-05, + "loss": 0.632, + "step": 2401 + }, + { + "epoch": 0.19, + "grad_norm": 1.3114522657589867, + "learning_rate": 1.8745432224343082e-05, + "loss": 0.6699, + "step": 2402 + }, + { + "epoch": 0.19, + "grad_norm": 1.2232823338042742, + "learning_rate": 1.874421343453174e-05, + "loss": 0.5913, + "step": 2403 + }, + { + "epoch": 0.19, + "grad_norm": 1.3775435676538899, + "learning_rate": 1.8742994092653096e-05, + "loss": 0.6376, + "step": 2404 + }, + { + "epoch": 0.19, + "grad_norm": 1.1533099245900462, + "learning_rate": 1.874177419878412e-05, + "loss": 0.5181, + "step": 2405 + }, + { + "epoch": 0.19, + "grad_norm": 1.2966567166019398, + "learning_rate": 1.8740553753001844e-05, + "loss": 0.6079, + "step": 2406 + }, + { + "epoch": 0.19, + "grad_norm": 1.179352415886464, + "learning_rate": 1.873933275538331e-05, + "loss": 0.5939, + "step": 2407 + }, + { + "epoch": 0.19, + "grad_norm": 1.2199868015450879, + "learning_rate": 1.8738111206005615e-05, + "loss": 0.5642, + "step": 2408 + }, + { + "epoch": 0.19, + "grad_norm": 1.2386872982627628, + "learning_rate": 1.8736889104945874e-05, + "loss": 0.6102, + "step": 2409 + }, + { + "epoch": 0.19, + "grad_norm": 1.3689206718048275, + "learning_rate": 1.8735666452281246e-05, + "loss": 0.677, + "step": 2410 + }, + { + "epoch": 0.19, + "grad_norm": 1.320869689384897, + "learning_rate": 1.8734443248088926e-05, + "loss": 0.6654, + "step": 2411 + }, + { + "epoch": 0.19, + "grad_norm": 1.3809177758629052, + "learning_rate": 1.873321949244614e-05, + "loss": 0.6641, + "step": 2412 + }, + { + "epoch": 0.19, + "grad_norm": 1.3519685168466142, + "learning_rate": 1.8731995185430148e-05, + "loss": 0.5993, + "step": 2413 + }, + { + "epoch": 0.19, + "grad_norm": 1.2307249256673876, + "learning_rate": 1.8730770327118254e-05, + "loss": 0.6285, + "step": 2414 + }, + { + "epoch": 0.19, + "grad_norm": 1.2267889797780407, + "learning_rate": 1.8729544917587778e-05, + "loss": 0.6415, + "step": 2415 + }, + { + "epoch": 0.19, + "grad_norm": 1.2773785600964451, + "learning_rate": 1.8728318956916096e-05, + "loss": 0.5713, + "step": 2416 + }, + { + "epoch": 0.19, + "grad_norm": 1.2343488762903918, + "learning_rate": 1.8727092445180605e-05, + "loss": 0.5605, + "step": 2417 + }, + { + "epoch": 0.19, + "grad_norm": 1.4341248404244182, + "learning_rate": 1.8725865382458744e-05, + "loss": 0.6518, + "step": 2418 + }, + { + "epoch": 0.19, + "grad_norm": 1.3182418205691848, + "learning_rate": 1.8724637768827982e-05, + "loss": 0.632, + "step": 2419 + }, + { + "epoch": 0.19, + "grad_norm": 1.1458106067600007, + "learning_rate": 1.8723409604365823e-05, + "loss": 0.5761, + "step": 2420 + }, + { + "epoch": 0.19, + "grad_norm": 1.330170525969723, + "learning_rate": 1.872218088914981e-05, + "loss": 0.6906, + "step": 2421 + }, + { + "epoch": 0.19, + "grad_norm": 1.2021378269548495, + "learning_rate": 1.8720951623257514e-05, + "loss": 0.6223, + "step": 2422 + }, + { + "epoch": 0.19, + "grad_norm": 1.1857673150989387, + "learning_rate": 1.871972180676655e-05, + "loss": 0.5929, + "step": 2423 + }, + { + "epoch": 0.19, + "grad_norm": 1.2485561137851693, + "learning_rate": 1.8718491439754562e-05, + "loss": 0.6092, + "step": 2424 + }, + { + "epoch": 0.19, + "grad_norm": 1.275888342955678, + "learning_rate": 1.8717260522299226e-05, + "loss": 0.6699, + "step": 2425 + }, + { + "epoch": 0.19, + "grad_norm": 1.2705311754733652, + "learning_rate": 1.8716029054478257e-05, + "loss": 0.6836, + "step": 2426 + }, + { + "epoch": 0.19, + "grad_norm": 1.2274701593038857, + "learning_rate": 1.8714797036369407e-05, + "loss": 0.6395, + "step": 2427 + }, + { + "epoch": 0.19, + "grad_norm": 1.1634775522588696, + "learning_rate": 1.8713564468050455e-05, + "loss": 0.5808, + "step": 2428 + }, + { + "epoch": 0.19, + "grad_norm": 1.237808955764608, + "learning_rate": 1.8712331349599227e-05, + "loss": 0.6695, + "step": 2429 + }, + { + "epoch": 0.19, + "grad_norm": 1.2227718798283174, + "learning_rate": 1.8711097681093568e-05, + "loss": 0.6011, + "step": 2430 + }, + { + "epoch": 0.19, + "grad_norm": 1.1968562809448942, + "learning_rate": 1.8709863462611368e-05, + "loss": 0.6742, + "step": 2431 + }, + { + "epoch": 0.19, + "grad_norm": 1.4422217235864745, + "learning_rate": 1.8708628694230554e-05, + "loss": 0.6482, + "step": 2432 + }, + { + "epoch": 0.19, + "grad_norm": 1.2474815747996928, + "learning_rate": 1.8707393376029076e-05, + "loss": 0.6287, + "step": 2433 + }, + { + "epoch": 0.19, + "grad_norm": 1.1562748983640754, + "learning_rate": 1.8706157508084934e-05, + "loss": 0.6359, + "step": 2434 + }, + { + "epoch": 0.19, + "grad_norm": 1.2451794178688973, + "learning_rate": 1.8704921090476148e-05, + "loss": 0.6593, + "step": 2435 + }, + { + "epoch": 0.19, + "grad_norm": 1.232030017298225, + "learning_rate": 1.870368412328078e-05, + "loss": 0.6008, + "step": 2436 + }, + { + "epoch": 0.19, + "grad_norm": 1.139302623749709, + "learning_rate": 1.870244660657693e-05, + "loss": 0.5903, + "step": 2437 + }, + { + "epoch": 0.19, + "grad_norm": 1.2632565412776657, + "learning_rate": 1.870120854044273e-05, + "loss": 0.6103, + "step": 2438 + }, + { + "epoch": 0.19, + "grad_norm": 1.3128410532024872, + "learning_rate": 1.869996992495634e-05, + "loss": 0.6331, + "step": 2439 + }, + { + "epoch": 0.19, + "grad_norm": 1.3096785418841905, + "learning_rate": 1.8698730760195963e-05, + "loss": 0.6391, + "step": 2440 + }, + { + "epoch": 0.19, + "grad_norm": 1.2535515874861367, + "learning_rate": 1.869749104623983e-05, + "loss": 0.6135, + "step": 2441 + }, + { + "epoch": 0.19, + "grad_norm": 1.2570058000926945, + "learning_rate": 1.8696250783166218e-05, + "loss": 0.6095, + "step": 2442 + }, + { + "epoch": 0.19, + "grad_norm": 1.2589870678916961, + "learning_rate": 1.869500997105343e-05, + "loss": 0.6358, + "step": 2443 + }, + { + "epoch": 0.19, + "grad_norm": 1.1986070992874267, + "learning_rate": 1.8693768609979796e-05, + "loss": 0.5585, + "step": 2444 + }, + { + "epoch": 0.19, + "grad_norm": 1.1968443286441857, + "learning_rate": 1.86925267000237e-05, + "loss": 0.6206, + "step": 2445 + }, + { + "epoch": 0.19, + "grad_norm": 1.22023404073097, + "learning_rate": 1.8691284241263547e-05, + "loss": 0.6005, + "step": 2446 + }, + { + "epoch": 0.19, + "grad_norm": 1.2934050500950776, + "learning_rate": 1.8690041233777774e-05, + "loss": 0.662, + "step": 2447 + }, + { + "epoch": 0.19, + "grad_norm": 1.2180060047571653, + "learning_rate": 1.8688797677644865e-05, + "loss": 0.5908, + "step": 2448 + }, + { + "epoch": 0.19, + "grad_norm": 1.2152569426915234, + "learning_rate": 1.868755357294333e-05, + "loss": 0.662, + "step": 2449 + }, + { + "epoch": 0.19, + "grad_norm": 1.4536301749942062, + "learning_rate": 1.8686308919751718e-05, + "loss": 0.6754, + "step": 2450 + }, + { + "epoch": 0.19, + "grad_norm": 1.2810053708144795, + "learning_rate": 1.8685063718148608e-05, + "loss": 0.5837, + "step": 2451 + }, + { + "epoch": 0.19, + "grad_norm": 1.3048902799440902, + "learning_rate": 1.8683817968212613e-05, + "loss": 0.713, + "step": 2452 + }, + { + "epoch": 0.19, + "grad_norm": 1.275016180571564, + "learning_rate": 1.868257167002239e-05, + "loss": 0.6437, + "step": 2453 + }, + { + "epoch": 0.19, + "grad_norm": 1.1846074965664053, + "learning_rate": 1.8681324823656623e-05, + "loss": 0.5569, + "step": 2454 + }, + { + "epoch": 0.19, + "grad_norm": 1.2770317699898526, + "learning_rate": 1.8680077429194025e-05, + "loss": 0.6058, + "step": 2455 + }, + { + "epoch": 0.19, + "grad_norm": 1.3043310626480014, + "learning_rate": 1.8678829486713362e-05, + "loss": 0.7058, + "step": 2456 + }, + { + "epoch": 0.19, + "grad_norm": 1.462291151605452, + "learning_rate": 1.867758099629341e-05, + "loss": 0.6179, + "step": 2457 + }, + { + "epoch": 0.19, + "grad_norm": 1.2707998656988748, + "learning_rate": 1.8676331958013004e-05, + "loss": 0.6079, + "step": 2458 + }, + { + "epoch": 0.19, + "grad_norm": 1.2663154602452986, + "learning_rate": 1.8675082371950996e-05, + "loss": 0.6044, + "step": 2459 + }, + { + "epoch": 0.19, + "grad_norm": 1.1410610462878796, + "learning_rate": 1.867383223818628e-05, + "loss": 0.5609, + "step": 2460 + }, + { + "epoch": 0.19, + "grad_norm": 1.3634408677011673, + "learning_rate": 1.8672581556797785e-05, + "loss": 0.6513, + "step": 2461 + }, + { + "epoch": 0.19, + "grad_norm": 1.2271815396157595, + "learning_rate": 1.867133032786447e-05, + "loss": 0.6535, + "step": 2462 + }, + { + "epoch": 0.19, + "grad_norm": 1.2678103476701896, + "learning_rate": 1.8670078551465336e-05, + "loss": 0.6123, + "step": 2463 + }, + { + "epoch": 0.19, + "grad_norm": 1.2029035290083356, + "learning_rate": 1.8668826227679408e-05, + "loss": 0.5776, + "step": 2464 + }, + { + "epoch": 0.19, + "grad_norm": 1.2136313714966078, + "learning_rate": 1.8667573356585756e-05, + "loss": 0.6221, + "step": 2465 + }, + { + "epoch": 0.19, + "grad_norm": 1.3528623559497148, + "learning_rate": 1.866631993826348e-05, + "loss": 0.6203, + "step": 2466 + }, + { + "epoch": 0.19, + "grad_norm": 1.220765818702567, + "learning_rate": 1.8665065972791715e-05, + "loss": 0.6093, + "step": 2467 + }, + { + "epoch": 0.19, + "grad_norm": 1.3541523370229156, + "learning_rate": 1.8663811460249625e-05, + "loss": 0.6144, + "step": 2468 + }, + { + "epoch": 0.19, + "grad_norm": 1.2202700403619804, + "learning_rate": 1.8662556400716423e-05, + "loss": 0.6005, + "step": 2469 + }, + { + "epoch": 0.19, + "grad_norm": 1.277062901410712, + "learning_rate": 1.866130079427134e-05, + "loss": 0.5709, + "step": 2470 + }, + { + "epoch": 0.19, + "grad_norm": 1.2106963378968074, + "learning_rate": 1.866004464099365e-05, + "loss": 0.6306, + "step": 2471 + }, + { + "epoch": 0.19, + "grad_norm": 1.270141361058738, + "learning_rate": 1.8658787940962666e-05, + "loss": 0.6145, + "step": 2472 + }, + { + "epoch": 0.19, + "grad_norm": 1.2725296060531797, + "learning_rate": 1.8657530694257722e-05, + "loss": 0.6286, + "step": 2473 + }, + { + "epoch": 0.19, + "grad_norm": 1.1505708666597803, + "learning_rate": 1.8656272900958202e-05, + "loss": 0.5983, + "step": 2474 + }, + { + "epoch": 0.19, + "grad_norm": 1.211748503781468, + "learning_rate": 1.865501456114351e-05, + "loss": 0.6194, + "step": 2475 + }, + { + "epoch": 0.19, + "grad_norm": 1.246196395807835, + "learning_rate": 1.8653755674893095e-05, + "loss": 0.6014, + "step": 2476 + }, + { + "epoch": 0.19, + "grad_norm": 1.2684632489357968, + "learning_rate": 1.865249624228644e-05, + "loss": 0.6523, + "step": 2477 + }, + { + "epoch": 0.19, + "grad_norm": 1.2319233365110043, + "learning_rate": 1.8651236263403052e-05, + "loss": 0.6185, + "step": 2478 + }, + { + "epoch": 0.19, + "grad_norm": 1.2574923090007528, + "learning_rate": 1.8649975738322487e-05, + "loss": 0.5813, + "step": 2479 + }, + { + "epoch": 0.19, + "grad_norm": 1.2989334866804474, + "learning_rate": 1.8648714667124323e-05, + "loss": 0.6515, + "step": 2480 + }, + { + "epoch": 0.19, + "grad_norm": 1.1623842571434297, + "learning_rate": 1.8647453049888186e-05, + "loss": 0.5784, + "step": 2481 + }, + { + "epoch": 0.19, + "grad_norm": 1.2289409537971798, + "learning_rate": 1.8646190886693717e-05, + "loss": 0.584, + "step": 2482 + }, + { + "epoch": 0.19, + "grad_norm": 1.148413158340434, + "learning_rate": 1.8644928177620612e-05, + "loss": 0.571, + "step": 2483 + }, + { + "epoch": 0.19, + "grad_norm": 1.2768933730033598, + "learning_rate": 1.864366492274859e-05, + "loss": 0.6365, + "step": 2484 + }, + { + "epoch": 0.19, + "grad_norm": 1.3055138626385328, + "learning_rate": 1.86424011221574e-05, + "loss": 0.5909, + "step": 2485 + }, + { + "epoch": 0.19, + "grad_norm": 1.2036296232092925, + "learning_rate": 1.8641136775926844e-05, + "loss": 0.6286, + "step": 2486 + }, + { + "epoch": 0.19, + "grad_norm": 1.1856331958799164, + "learning_rate": 1.863987188413674e-05, + "loss": 0.5756, + "step": 2487 + }, + { + "epoch": 0.19, + "grad_norm": 1.134590962767589, + "learning_rate": 1.8638606446866947e-05, + "loss": 0.4757, + "step": 2488 + }, + { + "epoch": 0.19, + "grad_norm": 1.2513143781626266, + "learning_rate": 1.8637340464197357e-05, + "loss": 0.6306, + "step": 2489 + }, + { + "epoch": 0.19, + "grad_norm": 1.1773843577266503, + "learning_rate": 1.8636073936207907e-05, + "loss": 0.5852, + "step": 2490 + }, + { + "epoch": 0.19, + "grad_norm": 1.1947211005544542, + "learning_rate": 1.863480686297855e-05, + "loss": 0.66, + "step": 2491 + }, + { + "epoch": 0.19, + "grad_norm": 1.2703404591241307, + "learning_rate": 1.8633539244589285e-05, + "loss": 0.5839, + "step": 2492 + }, + { + "epoch": 0.19, + "grad_norm": 1.310701864205226, + "learning_rate": 1.8632271081120144e-05, + "loss": 0.5919, + "step": 2493 + }, + { + "epoch": 0.19, + "grad_norm": 1.2219158526759577, + "learning_rate": 1.8631002372651194e-05, + "loss": 0.6121, + "step": 2494 + }, + { + "epoch": 0.19, + "grad_norm": 1.25539051279771, + "learning_rate": 1.8629733119262536e-05, + "loss": 0.61, + "step": 2495 + }, + { + "epoch": 0.19, + "grad_norm": 1.1989805500657778, + "learning_rate": 1.86284633210343e-05, + "loss": 0.5397, + "step": 2496 + }, + { + "epoch": 0.19, + "grad_norm": 1.220097066347891, + "learning_rate": 1.862719297804666e-05, + "loss": 0.6262, + "step": 2497 + }, + { + "epoch": 0.19, + "grad_norm": 1.3601523621145881, + "learning_rate": 1.8625922090379812e-05, + "loss": 0.6197, + "step": 2498 + }, + { + "epoch": 0.19, + "grad_norm": 1.261808173727957, + "learning_rate": 1.8624650658114003e-05, + "loss": 0.6342, + "step": 2499 + }, + { + "epoch": 0.19, + "grad_norm": 1.097432232461151, + "learning_rate": 1.86233786813295e-05, + "loss": 0.5538, + "step": 2500 + }, + { + "epoch": 0.19, + "grad_norm": 1.4307810452386007, + "learning_rate": 1.862210616010661e-05, + "loss": 0.7265, + "step": 2501 + }, + { + "epoch": 0.19, + "grad_norm": 1.2982976532874413, + "learning_rate": 1.862083309452567e-05, + "loss": 0.6277, + "step": 2502 + }, + { + "epoch": 0.19, + "grad_norm": 1.2739126804975092, + "learning_rate": 1.8619559484667063e-05, + "loss": 0.6756, + "step": 2503 + }, + { + "epoch": 0.19, + "grad_norm": 1.2373585918821108, + "learning_rate": 1.8618285330611193e-05, + "loss": 0.6096, + "step": 2504 + }, + { + "epoch": 0.19, + "grad_norm": 1.3507258741987285, + "learning_rate": 1.8617010632438508e-05, + "loss": 0.6555, + "step": 2505 + }, + { + "epoch": 0.19, + "grad_norm": 1.116901761307012, + "learning_rate": 1.8615735390229483e-05, + "loss": 0.6206, + "step": 2506 + }, + { + "epoch": 0.19, + "grad_norm": 1.1878597818752807, + "learning_rate": 1.8614459604064634e-05, + "loss": 0.5414, + "step": 2507 + }, + { + "epoch": 0.19, + "grad_norm": 1.2555094894512393, + "learning_rate": 1.86131832740245e-05, + "loss": 0.6433, + "step": 2508 + }, + { + "epoch": 0.19, + "grad_norm": 1.2274999741284276, + "learning_rate": 1.861190640018967e-05, + "loss": 0.6259, + "step": 2509 + }, + { + "epoch": 0.19, + "grad_norm": 1.2973945220528185, + "learning_rate": 1.8610628982640758e-05, + "loss": 0.6293, + "step": 2510 + }, + { + "epoch": 0.19, + "grad_norm": 1.1606728787424836, + "learning_rate": 1.8609351021458412e-05, + "loss": 0.5604, + "step": 2511 + }, + { + "epoch": 0.19, + "grad_norm": 1.3960318732571626, + "learning_rate": 1.8608072516723317e-05, + "loss": 0.6043, + "step": 2512 + }, + { + "epoch": 0.19, + "grad_norm": 1.3159074602432916, + "learning_rate": 1.8606793468516197e-05, + "loss": 0.625, + "step": 2513 + }, + { + "epoch": 0.2, + "grad_norm": 1.2911882386111526, + "learning_rate": 1.8605513876917793e-05, + "loss": 0.6286, + "step": 2514 + }, + { + "epoch": 0.2, + "grad_norm": 1.1841470161774408, + "learning_rate": 1.8604233742008903e-05, + "loss": 0.6274, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 1.158395169114741, + "learning_rate": 1.860295306387035e-05, + "loss": 0.6291, + "step": 2516 + }, + { + "epoch": 0.2, + "grad_norm": 1.1611165911004209, + "learning_rate": 1.8601671842582977e-05, + "loss": 0.5685, + "step": 2517 + }, + { + "epoch": 0.2, + "grad_norm": 1.2891511828810491, + "learning_rate": 1.8600390078227682e-05, + "loss": 0.6666, + "step": 2518 + }, + { + "epoch": 0.2, + "grad_norm": 1.2628273832165804, + "learning_rate": 1.8599107770885392e-05, + "loss": 0.5996, + "step": 2519 + }, + { + "epoch": 0.2, + "grad_norm": 1.2303213848697214, + "learning_rate": 1.8597824920637063e-05, + "loss": 0.6127, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 1.2173923364147705, + "learning_rate": 1.8596541527563686e-05, + "loss": 0.5605, + "step": 2521 + }, + { + "epoch": 0.2, + "grad_norm": 1.1828597912222343, + "learning_rate": 1.859525759174629e-05, + "loss": 0.6037, + "step": 2522 + }, + { + "epoch": 0.2, + "grad_norm": 1.3533218927714155, + "learning_rate": 1.8593973113265938e-05, + "loss": 0.5849, + "step": 2523 + }, + { + "epoch": 0.2, + "grad_norm": 1.1939991735874045, + "learning_rate": 1.8592688092203727e-05, + "loss": 0.5959, + "step": 2524 + }, + { + "epoch": 0.2, + "grad_norm": 1.3305723641638398, + "learning_rate": 1.8591402528640782e-05, + "loss": 0.6473, + "step": 2525 + }, + { + "epoch": 0.2, + "grad_norm": 1.2634680464205919, + "learning_rate": 1.8590116422658267e-05, + "loss": 0.5907, + "step": 2526 + }, + { + "epoch": 0.2, + "grad_norm": 1.2424357903850305, + "learning_rate": 1.8588829774337388e-05, + "loss": 0.6681, + "step": 2527 + }, + { + "epoch": 0.2, + "grad_norm": 1.290836708494584, + "learning_rate": 1.858754258375937e-05, + "loss": 0.6205, + "step": 2528 + }, + { + "epoch": 0.2, + "grad_norm": 1.1172271934873748, + "learning_rate": 1.858625485100548e-05, + "loss": 0.5876, + "step": 2529 + }, + { + "epoch": 0.2, + "grad_norm": 1.3565895657496478, + "learning_rate": 1.8584966576157028e-05, + "loss": 0.5838, + "step": 2530 + }, + { + "epoch": 0.2, + "grad_norm": 1.1605733513420058, + "learning_rate": 1.8583677759295337e-05, + "loss": 0.5603, + "step": 2531 + }, + { + "epoch": 0.2, + "grad_norm": 1.1943168732189107, + "learning_rate": 1.8582388400501786e-05, + "loss": 0.5914, + "step": 2532 + }, + { + "epoch": 0.2, + "grad_norm": 1.340174775895629, + "learning_rate": 1.8581098499857776e-05, + "loss": 0.636, + "step": 2533 + }, + { + "epoch": 0.2, + "grad_norm": 1.195819871718148, + "learning_rate": 1.8579808057444744e-05, + "loss": 0.6409, + "step": 2534 + }, + { + "epoch": 0.2, + "grad_norm": 1.3152215533853757, + "learning_rate": 1.8578517073344168e-05, + "loss": 0.5781, + "step": 2535 + }, + { + "epoch": 0.2, + "grad_norm": 1.1787502784769128, + "learning_rate": 1.8577225547637545e-05, + "loss": 0.6261, + "step": 2536 + }, + { + "epoch": 0.2, + "grad_norm": 1.2210191485460118, + "learning_rate": 1.857593348040642e-05, + "loss": 0.6121, + "step": 2537 + }, + { + "epoch": 0.2, + "grad_norm": 1.2325777901325448, + "learning_rate": 1.857464087173237e-05, + "loss": 0.6224, + "step": 2538 + }, + { + "epoch": 0.2, + "grad_norm": 1.2258120628359654, + "learning_rate": 1.8573347721697007e-05, + "loss": 0.5919, + "step": 2539 + }, + { + "epoch": 0.2, + "grad_norm": 1.2111445588315997, + "learning_rate": 1.8572054030381963e-05, + "loss": 0.5985, + "step": 2540 + }, + { + "epoch": 0.2, + "grad_norm": 1.2401532003916247, + "learning_rate": 1.8570759797868925e-05, + "loss": 0.6971, + "step": 2541 + }, + { + "epoch": 0.2, + "grad_norm": 1.1244443474885986, + "learning_rate": 1.8569465024239603e-05, + "loss": 0.5731, + "step": 2542 + }, + { + "epoch": 0.2, + "grad_norm": 1.1371780463882148, + "learning_rate": 1.8568169709575737e-05, + "loss": 0.6314, + "step": 2543 + }, + { + "epoch": 0.2, + "grad_norm": 1.2121776506260318, + "learning_rate": 1.856687385395911e-05, + "loss": 0.6172, + "step": 2544 + }, + { + "epoch": 0.2, + "grad_norm": 1.359451423338358, + "learning_rate": 1.8565577457471545e-05, + "loss": 0.6485, + "step": 2545 + }, + { + "epoch": 0.2, + "grad_norm": 1.2032117936175215, + "learning_rate": 1.8564280520194876e-05, + "loss": 0.5711, + "step": 2546 + }, + { + "epoch": 0.2, + "grad_norm": 1.418571743290778, + "learning_rate": 1.8562983042210998e-05, + "loss": 0.6426, + "step": 2547 + }, + { + "epoch": 0.2, + "grad_norm": 1.3211135691324105, + "learning_rate": 1.8561685023601815e-05, + "loss": 0.5697, + "step": 2548 + }, + { + "epoch": 0.2, + "grad_norm": 1.3007259414415762, + "learning_rate": 1.8560386464449288e-05, + "loss": 0.6598, + "step": 2549 + }, + { + "epoch": 0.2, + "grad_norm": 1.1542516915761523, + "learning_rate": 1.8559087364835397e-05, + "loss": 0.6174, + "step": 2550 + }, + { + "epoch": 0.2, + "grad_norm": 1.3336154321428684, + "learning_rate": 1.855778772484216e-05, + "loss": 0.6085, + "step": 2551 + }, + { + "epoch": 0.2, + "grad_norm": 1.269227023719383, + "learning_rate": 1.8556487544551633e-05, + "loss": 0.5854, + "step": 2552 + }, + { + "epoch": 0.2, + "grad_norm": 1.3994029883873589, + "learning_rate": 1.8555186824045903e-05, + "loss": 0.6521, + "step": 2553 + }, + { + "epoch": 0.2, + "grad_norm": 1.3745360458687568, + "learning_rate": 1.8553885563407086e-05, + "loss": 0.6457, + "step": 2554 + }, + { + "epoch": 0.2, + "grad_norm": 1.1461255423433991, + "learning_rate": 1.8552583762717346e-05, + "loss": 0.5572, + "step": 2555 + }, + { + "epoch": 0.2, + "grad_norm": 1.2501790395308439, + "learning_rate": 1.8551281422058863e-05, + "loss": 0.6272, + "step": 2556 + }, + { + "epoch": 0.2, + "grad_norm": 1.299685959297986, + "learning_rate": 1.8549978541513865e-05, + "loss": 0.6544, + "step": 2557 + }, + { + "epoch": 0.2, + "grad_norm": 1.1721466766790667, + "learning_rate": 1.8548675121164613e-05, + "loss": 0.625, + "step": 2558 + }, + { + "epoch": 0.2, + "grad_norm": 1.2346274685102632, + "learning_rate": 1.8547371161093397e-05, + "loss": 0.6582, + "step": 2559 + }, + { + "epoch": 0.2, + "grad_norm": 1.145131237055101, + "learning_rate": 1.8546066661382535e-05, + "loss": 0.5911, + "step": 2560 + }, + { + "epoch": 0.2, + "grad_norm": 1.1776541560800873, + "learning_rate": 1.8544761622114396e-05, + "loss": 0.6055, + "step": 2561 + }, + { + "epoch": 0.2, + "grad_norm": 1.3578178870968904, + "learning_rate": 1.854345604337137e-05, + "loss": 0.6334, + "step": 2562 + }, + { + "epoch": 0.2, + "grad_norm": 1.1676885976947013, + "learning_rate": 1.8542149925235885e-05, + "loss": 0.5701, + "step": 2563 + }, + { + "epoch": 0.2, + "grad_norm": 1.2746782401745258, + "learning_rate": 1.8540843267790406e-05, + "loss": 0.6239, + "step": 2564 + }, + { + "epoch": 0.2, + "grad_norm": 1.1632292152382684, + "learning_rate": 1.8539536071117424e-05, + "loss": 0.5816, + "step": 2565 + }, + { + "epoch": 0.2, + "grad_norm": 1.1965253573232237, + "learning_rate": 1.853822833529947e-05, + "loss": 0.5795, + "step": 2566 + }, + { + "epoch": 0.2, + "grad_norm": 1.1052932869597845, + "learning_rate": 1.853692006041911e-05, + "loss": 0.589, + "step": 2567 + }, + { + "epoch": 0.2, + "grad_norm": 1.2843409679943978, + "learning_rate": 1.8535611246558947e-05, + "loss": 0.6422, + "step": 2568 + }, + { + "epoch": 0.2, + "grad_norm": 1.2733504844845251, + "learning_rate": 1.8534301893801606e-05, + "loss": 0.6569, + "step": 2569 + }, + { + "epoch": 0.2, + "grad_norm": 1.2016262776645703, + "learning_rate": 1.8532992002229755e-05, + "loss": 0.6028, + "step": 2570 + }, + { + "epoch": 0.2, + "grad_norm": 1.420376753324155, + "learning_rate": 1.853168157192609e-05, + "loss": 0.6932, + "step": 2571 + }, + { + "epoch": 0.2, + "grad_norm": 1.1569254165454455, + "learning_rate": 1.8530370602973356e-05, + "loss": 0.5704, + "step": 2572 + }, + { + "epoch": 0.2, + "grad_norm": 1.3356085924525085, + "learning_rate": 1.852905909545431e-05, + "loss": 0.6361, + "step": 2573 + }, + { + "epoch": 0.2, + "grad_norm": 1.1938464580132024, + "learning_rate": 1.8527747049451763e-05, + "loss": 0.6223, + "step": 2574 + }, + { + "epoch": 0.2, + "grad_norm": 1.2820819386766096, + "learning_rate": 1.8526434465048544e-05, + "loss": 0.6493, + "step": 2575 + }, + { + "epoch": 0.2, + "grad_norm": 1.2741980592508404, + "learning_rate": 1.852512134232753e-05, + "loss": 0.6682, + "step": 2576 + }, + { + "epoch": 0.2, + "grad_norm": 1.1854992123218688, + "learning_rate": 1.8523807681371622e-05, + "loss": 0.6324, + "step": 2577 + }, + { + "epoch": 0.2, + "grad_norm": 1.340573081765185, + "learning_rate": 1.8522493482263753e-05, + "loss": 0.6448, + "step": 2578 + }, + { + "epoch": 0.2, + "grad_norm": 1.214321993438132, + "learning_rate": 1.8521178745086906e-05, + "loss": 0.6302, + "step": 2579 + }, + { + "epoch": 0.2, + "grad_norm": 1.1534556051903955, + "learning_rate": 1.8519863469924078e-05, + "loss": 0.5599, + "step": 2580 + }, + { + "epoch": 0.2, + "grad_norm": 1.2511316422190768, + "learning_rate": 1.8518547656858314e-05, + "loss": 0.6031, + "step": 2581 + }, + { + "epoch": 0.2, + "grad_norm": 1.309353963459886, + "learning_rate": 1.8517231305972686e-05, + "loss": 0.6538, + "step": 2582 + }, + { + "epoch": 0.2, + "grad_norm": 1.1116357789807634, + "learning_rate": 1.85159144173503e-05, + "loss": 0.5839, + "step": 2583 + }, + { + "epoch": 0.2, + "grad_norm": 1.3377808463173022, + "learning_rate": 1.8514596991074305e-05, + "loss": 0.6171, + "step": 2584 + }, + { + "epoch": 0.2, + "grad_norm": 1.2714148530250609, + "learning_rate": 1.8513279027227868e-05, + "loss": 0.6274, + "step": 2585 + }, + { + "epoch": 0.2, + "grad_norm": 1.1845584878229116, + "learning_rate": 1.8511960525894207e-05, + "loss": 0.5645, + "step": 2586 + }, + { + "epoch": 0.2, + "grad_norm": 1.2666671135968541, + "learning_rate": 1.851064148715656e-05, + "loss": 0.6043, + "step": 2587 + }, + { + "epoch": 0.2, + "grad_norm": 1.1955396149731525, + "learning_rate": 1.8509321911098206e-05, + "loss": 0.5603, + "step": 2588 + }, + { + "epoch": 0.2, + "grad_norm": 1.3599628186051949, + "learning_rate": 1.850800179780246e-05, + "loss": 0.664, + "step": 2589 + }, + { + "epoch": 0.2, + "grad_norm": 1.1808500309106609, + "learning_rate": 1.8506681147352662e-05, + "loss": 0.5671, + "step": 2590 + }, + { + "epoch": 0.2, + "grad_norm": 1.1913907409659108, + "learning_rate": 1.8505359959832198e-05, + "loss": 0.6267, + "step": 2591 + }, + { + "epoch": 0.2, + "grad_norm": 1.2191746289889973, + "learning_rate": 1.8504038235324474e-05, + "loss": 0.644, + "step": 2592 + }, + { + "epoch": 0.2, + "grad_norm": 1.1284596969039795, + "learning_rate": 1.8502715973912942e-05, + "loss": 0.5789, + "step": 2593 + }, + { + "epoch": 0.2, + "grad_norm": 1.272040367154707, + "learning_rate": 1.8501393175681084e-05, + "loss": 0.6216, + "step": 2594 + }, + { + "epoch": 0.2, + "grad_norm": 1.2028242453648108, + "learning_rate": 1.8500069840712412e-05, + "loss": 0.5937, + "step": 2595 + }, + { + "epoch": 0.2, + "grad_norm": 1.1447960348743038, + "learning_rate": 1.849874596909048e-05, + "loss": 0.5663, + "step": 2596 + }, + { + "epoch": 0.2, + "grad_norm": 1.221488224093012, + "learning_rate": 1.849742156089886e-05, + "loss": 0.6107, + "step": 2597 + }, + { + "epoch": 0.2, + "grad_norm": 1.3577475616764714, + "learning_rate": 1.8496096616221182e-05, + "loss": 0.65, + "step": 2598 + }, + { + "epoch": 0.2, + "grad_norm": 1.2733652761373986, + "learning_rate": 1.8494771135141086e-05, + "loss": 0.5847, + "step": 2599 + }, + { + "epoch": 0.2, + "grad_norm": 1.3544261341449815, + "learning_rate": 1.8493445117742267e-05, + "loss": 0.7011, + "step": 2600 + }, + { + "epoch": 0.2, + "grad_norm": 1.2235235970750573, + "learning_rate": 1.8492118564108433e-05, + "loss": 0.6319, + "step": 2601 + }, + { + "epoch": 0.2, + "grad_norm": 1.2630315039534614, + "learning_rate": 1.849079147432334e-05, + "loss": 0.6244, + "step": 2602 + }, + { + "epoch": 0.2, + "grad_norm": 1.2065669280837659, + "learning_rate": 1.8489463848470777e-05, + "loss": 0.5747, + "step": 2603 + }, + { + "epoch": 0.2, + "grad_norm": 1.2684779096220387, + "learning_rate": 1.848813568663456e-05, + "loss": 0.6744, + "step": 2604 + }, + { + "epoch": 0.2, + "grad_norm": 1.2708911830471694, + "learning_rate": 1.8486806988898545e-05, + "loss": 0.6292, + "step": 2605 + }, + { + "epoch": 0.2, + "grad_norm": 1.387470954728476, + "learning_rate": 1.848547775534662e-05, + "loss": 0.6168, + "step": 2606 + }, + { + "epoch": 0.2, + "grad_norm": 1.21128894259088, + "learning_rate": 1.8484147986062702e-05, + "loss": 0.5725, + "step": 2607 + }, + { + "epoch": 0.2, + "grad_norm": 1.234802606614738, + "learning_rate": 1.8482817681130752e-05, + "loss": 0.6094, + "step": 2608 + }, + { + "epoch": 0.2, + "grad_norm": 1.2190169017705899, + "learning_rate": 1.8481486840634753e-05, + "loss": 0.6704, + "step": 2609 + }, + { + "epoch": 0.2, + "grad_norm": 1.258088123143036, + "learning_rate": 1.848015546465873e-05, + "loss": 0.6575, + "step": 2610 + }, + { + "epoch": 0.2, + "grad_norm": 1.215394756802738, + "learning_rate": 1.8478823553286745e-05, + "loss": 0.5818, + "step": 2611 + }, + { + "epoch": 0.2, + "grad_norm": 1.2669526189391536, + "learning_rate": 1.847749110660288e-05, + "loss": 0.6307, + "step": 2612 + }, + { + "epoch": 0.2, + "grad_norm": 1.091879143797114, + "learning_rate": 1.847615812469127e-05, + "loss": 0.571, + "step": 2613 + }, + { + "epoch": 0.2, + "grad_norm": 1.1244845269013442, + "learning_rate": 1.847482460763606e-05, + "loss": 0.5366, + "step": 2614 + }, + { + "epoch": 0.2, + "grad_norm": 1.1921238973507857, + "learning_rate": 1.847349055552145e-05, + "loss": 0.5979, + "step": 2615 + }, + { + "epoch": 0.2, + "grad_norm": 1.2410866039559778, + "learning_rate": 1.8472155968431664e-05, + "loss": 0.6689, + "step": 2616 + }, + { + "epoch": 0.2, + "grad_norm": 1.356489912690882, + "learning_rate": 1.847082084645096e-05, + "loss": 0.7003, + "step": 2617 + }, + { + "epoch": 0.2, + "grad_norm": 1.2204231124157234, + "learning_rate": 1.8469485189663635e-05, + "loss": 0.6102, + "step": 2618 + }, + { + "epoch": 0.2, + "grad_norm": 1.2619691015774441, + "learning_rate": 1.846814899815401e-05, + "loss": 0.6076, + "step": 2619 + }, + { + "epoch": 0.2, + "grad_norm": 1.1410021746194294, + "learning_rate": 1.846681227200645e-05, + "loss": 0.6001, + "step": 2620 + }, + { + "epoch": 0.2, + "grad_norm": 1.271208514921696, + "learning_rate": 1.846547501130535e-05, + "loss": 0.6767, + "step": 2621 + }, + { + "epoch": 0.2, + "grad_norm": 1.0982250219744067, + "learning_rate": 1.8464137216135133e-05, + "loss": 0.5341, + "step": 2622 + }, + { + "epoch": 0.2, + "grad_norm": 1.265605549603989, + "learning_rate": 1.846279888658027e-05, + "loss": 0.6375, + "step": 2623 + }, + { + "epoch": 0.2, + "grad_norm": 1.2173059173294758, + "learning_rate": 1.8461460022725247e-05, + "loss": 0.6639, + "step": 2624 + }, + { + "epoch": 0.2, + "grad_norm": 1.282403101392959, + "learning_rate": 1.8460120624654595e-05, + "loss": 0.5959, + "step": 2625 + }, + { + "epoch": 0.2, + "grad_norm": 1.2232820901783792, + "learning_rate": 1.8458780692452882e-05, + "loss": 0.6204, + "step": 2626 + }, + { + "epoch": 0.2, + "grad_norm": 1.2671190556152214, + "learning_rate": 1.8457440226204702e-05, + "loss": 0.6453, + "step": 2627 + }, + { + "epoch": 0.2, + "grad_norm": 1.2586802457027126, + "learning_rate": 1.8456099225994687e-05, + "loss": 0.6056, + "step": 2628 + }, + { + "epoch": 0.2, + "grad_norm": 1.2801579263984155, + "learning_rate": 1.84547576919075e-05, + "loss": 0.6207, + "step": 2629 + }, + { + "epoch": 0.2, + "grad_norm": 1.2125485931569533, + "learning_rate": 1.845341562402784e-05, + "loss": 0.6263, + "step": 2630 + }, + { + "epoch": 0.2, + "grad_norm": 1.2536828623449645, + "learning_rate": 1.8452073022440436e-05, + "loss": 0.6718, + "step": 2631 + }, + { + "epoch": 0.2, + "grad_norm": 1.29967642021547, + "learning_rate": 1.8450729887230053e-05, + "loss": 0.6593, + "step": 2632 + }, + { + "epoch": 0.2, + "grad_norm": 1.2974322856660658, + "learning_rate": 1.8449386218481495e-05, + "loss": 0.6644, + "step": 2633 + }, + { + "epoch": 0.2, + "grad_norm": 1.3146686348945504, + "learning_rate": 1.8448042016279592e-05, + "loss": 0.6526, + "step": 2634 + }, + { + "epoch": 0.2, + "grad_norm": 1.327386987620711, + "learning_rate": 1.844669728070921e-05, + "loss": 0.6544, + "step": 2635 + }, + { + "epoch": 0.2, + "grad_norm": 1.12063802036055, + "learning_rate": 1.844535201185525e-05, + "loss": 0.5968, + "step": 2636 + }, + { + "epoch": 0.2, + "grad_norm": 1.3718246294302483, + "learning_rate": 1.8444006209802646e-05, + "loss": 0.6414, + "step": 2637 + }, + { + "epoch": 0.2, + "grad_norm": 1.0747232258264159, + "learning_rate": 1.8442659874636365e-05, + "loss": 0.5564, + "step": 2638 + }, + { + "epoch": 0.2, + "grad_norm": 1.1955376705933867, + "learning_rate": 1.8441313006441402e-05, + "loss": 0.5549, + "step": 2639 + }, + { + "epoch": 0.2, + "grad_norm": 1.2364534191022152, + "learning_rate": 1.8439965605302803e-05, + "loss": 0.5558, + "step": 2640 + }, + { + "epoch": 0.2, + "grad_norm": 1.229463342111611, + "learning_rate": 1.843861767130563e-05, + "loss": 0.6207, + "step": 2641 + }, + { + "epoch": 0.2, + "grad_norm": 1.301853276592066, + "learning_rate": 1.8437269204534983e-05, + "loss": 0.6566, + "step": 2642 + }, + { + "epoch": 0.21, + "grad_norm": 1.0536674370328925, + "learning_rate": 1.8435920205076003e-05, + "loss": 0.5349, + "step": 2643 + }, + { + "epoch": 0.21, + "grad_norm": 1.2264069379575435, + "learning_rate": 1.843457067301386e-05, + "loss": 0.6085, + "step": 2644 + }, + { + "epoch": 0.21, + "grad_norm": 1.3700773813850702, + "learning_rate": 1.8433220608433747e-05, + "loss": 0.6298, + "step": 2645 + }, + { + "epoch": 0.21, + "grad_norm": 1.3546180804108368, + "learning_rate": 1.8431870011420908e-05, + "loss": 0.6848, + "step": 2646 + }, + { + "epoch": 0.21, + "grad_norm": 1.2270667624960336, + "learning_rate": 1.8430518882060618e-05, + "loss": 0.6283, + "step": 2647 + }, + { + "epoch": 0.21, + "grad_norm": 1.2408093178608892, + "learning_rate": 1.8429167220438172e-05, + "loss": 0.6122, + "step": 2648 + }, + { + "epoch": 0.21, + "grad_norm": 1.224188676086163, + "learning_rate": 1.842781502663891e-05, + "loss": 0.6537, + "step": 2649 + }, + { + "epoch": 0.21, + "grad_norm": 1.2322450925141506, + "learning_rate": 1.8426462300748206e-05, + "loss": 0.6884, + "step": 2650 + }, + { + "epoch": 0.21, + "grad_norm": 1.2752650901621032, + "learning_rate": 1.842510904285146e-05, + "loss": 0.6021, + "step": 2651 + }, + { + "epoch": 0.21, + "grad_norm": 1.227564942687233, + "learning_rate": 1.8423755253034113e-05, + "loss": 0.6353, + "step": 2652 + }, + { + "epoch": 0.21, + "grad_norm": 1.2917440093890669, + "learning_rate": 1.8422400931381635e-05, + "loss": 0.6622, + "step": 2653 + }, + { + "epoch": 0.21, + "grad_norm": 1.3524081721131431, + "learning_rate": 1.8421046077979535e-05, + "loss": 0.6717, + "step": 2654 + }, + { + "epoch": 0.21, + "grad_norm": 1.243483916531045, + "learning_rate": 1.8419690692913346e-05, + "loss": 0.6144, + "step": 2655 + }, + { + "epoch": 0.21, + "grad_norm": 1.332726126730903, + "learning_rate": 1.8418334776268643e-05, + "loss": 0.6453, + "step": 2656 + }, + { + "epoch": 0.21, + "grad_norm": 1.246272202870776, + "learning_rate": 1.8416978328131036e-05, + "loss": 0.6248, + "step": 2657 + }, + { + "epoch": 0.21, + "grad_norm": 1.239141604579045, + "learning_rate": 1.841562134858616e-05, + "loss": 0.6275, + "step": 2658 + }, + { + "epoch": 0.21, + "grad_norm": 1.2189441917449733, + "learning_rate": 1.8414263837719688e-05, + "loss": 0.5938, + "step": 2659 + }, + { + "epoch": 0.21, + "grad_norm": 1.2622618554492198, + "learning_rate": 1.8412905795617327e-05, + "loss": 0.6258, + "step": 2660 + }, + { + "epoch": 0.21, + "grad_norm": 1.2496346416584134, + "learning_rate": 1.8411547222364824e-05, + "loss": 0.6185, + "step": 2661 + }, + { + "epoch": 0.21, + "grad_norm": 1.2873301569937172, + "learning_rate": 1.8410188118047942e-05, + "loss": 0.6237, + "step": 2662 + }, + { + "epoch": 0.21, + "grad_norm": 1.1988220055586474, + "learning_rate": 1.8408828482752495e-05, + "loss": 0.6378, + "step": 2663 + }, + { + "epoch": 0.21, + "grad_norm": 1.2618972132457877, + "learning_rate": 1.8407468316564322e-05, + "loss": 0.6317, + "step": 2664 + }, + { + "epoch": 0.21, + "grad_norm": 1.140429022688034, + "learning_rate": 1.84061076195693e-05, + "loss": 0.6108, + "step": 2665 + }, + { + "epoch": 0.21, + "grad_norm": 1.221287262581384, + "learning_rate": 1.8404746391853328e-05, + "loss": 0.6259, + "step": 2666 + }, + { + "epoch": 0.21, + "grad_norm": 1.2791333276005448, + "learning_rate": 1.840338463350236e-05, + "loss": 0.6019, + "step": 2667 + }, + { + "epoch": 0.21, + "grad_norm": 1.307468078179842, + "learning_rate": 1.840202234460236e-05, + "loss": 0.6018, + "step": 2668 + }, + { + "epoch": 0.21, + "grad_norm": 1.222592483173903, + "learning_rate": 1.8400659525239343e-05, + "loss": 0.6686, + "step": 2669 + }, + { + "epoch": 0.21, + "grad_norm": 1.2743021364134406, + "learning_rate": 1.8399296175499346e-05, + "loss": 0.6374, + "step": 2670 + }, + { + "epoch": 0.21, + "grad_norm": 1.0510973827060548, + "learning_rate": 1.8397932295468446e-05, + "loss": 0.5676, + "step": 2671 + }, + { + "epoch": 0.21, + "grad_norm": 1.2944416979777424, + "learning_rate": 1.8396567885232753e-05, + "loss": 0.6113, + "step": 2672 + }, + { + "epoch": 0.21, + "grad_norm": 1.1775259969153802, + "learning_rate": 1.839520294487841e-05, + "loss": 0.6515, + "step": 2673 + }, + { + "epoch": 0.21, + "grad_norm": 1.3056745621480876, + "learning_rate": 1.8393837474491595e-05, + "loss": 0.6288, + "step": 2674 + }, + { + "epoch": 0.21, + "grad_norm": 1.243677505220659, + "learning_rate": 1.839247147415851e-05, + "loss": 0.6424, + "step": 2675 + }, + { + "epoch": 0.21, + "grad_norm": 1.3332101298074706, + "learning_rate": 1.83911049439654e-05, + "loss": 0.7071, + "step": 2676 + }, + { + "epoch": 0.21, + "grad_norm": 1.1380195646766325, + "learning_rate": 1.8389737883998545e-05, + "loss": 0.6372, + "step": 2677 + }, + { + "epoch": 0.21, + "grad_norm": 1.2390205751424068, + "learning_rate": 1.838837029434425e-05, + "loss": 0.6337, + "step": 2678 + }, + { + "epoch": 0.21, + "grad_norm": 1.4461415155684236, + "learning_rate": 1.8387002175088863e-05, + "loss": 0.6239, + "step": 2679 + }, + { + "epoch": 0.21, + "grad_norm": 1.2419222662521607, + "learning_rate": 1.8385633526318758e-05, + "loss": 0.624, + "step": 2680 + }, + { + "epoch": 0.21, + "grad_norm": 1.1862414868836182, + "learning_rate": 1.838426434812034e-05, + "loss": 0.6165, + "step": 2681 + }, + { + "epoch": 0.21, + "grad_norm": 1.2945934586895842, + "learning_rate": 1.838289464058006e-05, + "loss": 0.6867, + "step": 2682 + }, + { + "epoch": 0.21, + "grad_norm": 1.2284431375487235, + "learning_rate": 1.838152440378439e-05, + "loss": 0.6384, + "step": 2683 + }, + { + "epoch": 0.21, + "grad_norm": 1.261263502443776, + "learning_rate": 1.8380153637819843e-05, + "loss": 0.6266, + "step": 2684 + }, + { + "epoch": 0.21, + "grad_norm": 1.1933221639522478, + "learning_rate": 1.8378782342772958e-05, + "loss": 0.5394, + "step": 2685 + }, + { + "epoch": 0.21, + "grad_norm": 1.1678693339926849, + "learning_rate": 1.8377410518730318e-05, + "loss": 0.5672, + "step": 2686 + }, + { + "epoch": 0.21, + "grad_norm": 1.2024879255104957, + "learning_rate": 1.8376038165778526e-05, + "loss": 0.6657, + "step": 2687 + }, + { + "epoch": 0.21, + "grad_norm": 1.1051933565602114, + "learning_rate": 1.8374665284004235e-05, + "loss": 0.5627, + "step": 2688 + }, + { + "epoch": 0.21, + "grad_norm": 1.2497066153504524, + "learning_rate": 1.8373291873494112e-05, + "loss": 0.6754, + "step": 2689 + }, + { + "epoch": 0.21, + "grad_norm": 1.2209087717402778, + "learning_rate": 1.8371917934334877e-05, + "loss": 0.6259, + "step": 2690 + }, + { + "epoch": 0.21, + "grad_norm": 1.2217406235583503, + "learning_rate": 1.8370543466613267e-05, + "loss": 0.5533, + "step": 2691 + }, + { + "epoch": 0.21, + "grad_norm": 1.0858125683391366, + "learning_rate": 1.8369168470416064e-05, + "loss": 0.5388, + "step": 2692 + }, + { + "epoch": 0.21, + "grad_norm": 1.2870784865798823, + "learning_rate": 1.8367792945830073e-05, + "loss": 0.7085, + "step": 2693 + }, + { + "epoch": 0.21, + "grad_norm": 1.1734353422305082, + "learning_rate": 1.836641689294214e-05, + "loss": 0.6266, + "step": 2694 + }, + { + "epoch": 0.21, + "grad_norm": 1.3411617191694807, + "learning_rate": 1.8365040311839148e-05, + "loss": 0.618, + "step": 2695 + }, + { + "epoch": 0.21, + "grad_norm": 1.3621934633415456, + "learning_rate": 1.8363663202608e-05, + "loss": 0.6862, + "step": 2696 + }, + { + "epoch": 0.21, + "grad_norm": 1.2500355238635081, + "learning_rate": 1.8362285565335645e-05, + "loss": 0.618, + "step": 2697 + }, + { + "epoch": 0.21, + "grad_norm": 1.3374980712591507, + "learning_rate": 1.836090740010906e-05, + "loss": 0.6414, + "step": 2698 + }, + { + "epoch": 0.21, + "grad_norm": 1.3028346170408762, + "learning_rate": 1.835952870701525e-05, + "loss": 0.6513, + "step": 2699 + }, + { + "epoch": 0.21, + "grad_norm": 1.2482191274390388, + "learning_rate": 1.8358149486141265e-05, + "loss": 0.6213, + "step": 2700 + }, + { + "epoch": 0.21, + "grad_norm": 1.2262618552907405, + "learning_rate": 1.835676973757418e-05, + "loss": 0.5418, + "step": 2701 + }, + { + "epoch": 0.21, + "grad_norm": 1.1754886098266861, + "learning_rate": 1.835538946140111e-05, + "loss": 0.5945, + "step": 2702 + }, + { + "epoch": 0.21, + "grad_norm": 1.2011351541007982, + "learning_rate": 1.835400865770919e-05, + "loss": 0.6356, + "step": 2703 + }, + { + "epoch": 0.21, + "grad_norm": 1.258339623357618, + "learning_rate": 1.8352627326585606e-05, + "loss": 0.6318, + "step": 2704 + }, + { + "epoch": 0.21, + "grad_norm": 1.2204543203473468, + "learning_rate": 1.8351245468117564e-05, + "loss": 0.6116, + "step": 2705 + }, + { + "epoch": 0.21, + "grad_norm": 1.1386343424714571, + "learning_rate": 1.834986308239231e-05, + "loss": 0.686, + "step": 2706 + }, + { + "epoch": 0.21, + "grad_norm": 1.1711072822995203, + "learning_rate": 1.8348480169497114e-05, + "loss": 0.573, + "step": 2707 + }, + { + "epoch": 0.21, + "grad_norm": 1.1923948595092952, + "learning_rate": 1.8347096729519298e-05, + "loss": 0.5769, + "step": 2708 + }, + { + "epoch": 0.21, + "grad_norm": 1.2338145951634476, + "learning_rate": 1.83457127625462e-05, + "loss": 0.5593, + "step": 2709 + }, + { + "epoch": 0.21, + "grad_norm": 1.3051236731291675, + "learning_rate": 1.8344328268665193e-05, + "loss": 0.6239, + "step": 2710 + }, + { + "epoch": 0.21, + "grad_norm": 1.3311047506118519, + "learning_rate": 1.83429432479637e-05, + "loss": 0.632, + "step": 2711 + }, + { + "epoch": 0.21, + "grad_norm": 1.278516178847769, + "learning_rate": 1.834155770052915e-05, + "loss": 0.5806, + "step": 2712 + }, + { + "epoch": 0.21, + "grad_norm": 1.323828580647444, + "learning_rate": 1.8340171626449025e-05, + "loss": 0.5996, + "step": 2713 + }, + { + "epoch": 0.21, + "grad_norm": 1.1035350696572261, + "learning_rate": 1.8338785025810836e-05, + "loss": 0.5794, + "step": 2714 + }, + { + "epoch": 0.21, + "grad_norm": 1.3069138405207896, + "learning_rate": 1.8337397898702127e-05, + "loss": 0.6397, + "step": 2715 + }, + { + "epoch": 0.21, + "grad_norm": 1.2692549184689246, + "learning_rate": 1.833601024521047e-05, + "loss": 0.5903, + "step": 2716 + }, + { + "epoch": 0.21, + "grad_norm": 1.2384900900729578, + "learning_rate": 1.8334622065423482e-05, + "loss": 0.6285, + "step": 2717 + }, + { + "epoch": 0.21, + "grad_norm": 1.11634362634413, + "learning_rate": 1.8333233359428798e-05, + "loss": 0.6394, + "step": 2718 + }, + { + "epoch": 0.21, + "grad_norm": 1.2987121530255736, + "learning_rate": 1.8331844127314104e-05, + "loss": 0.6591, + "step": 2719 + }, + { + "epoch": 0.21, + "grad_norm": 1.1814799554216615, + "learning_rate": 1.83304543691671e-05, + "loss": 0.588, + "step": 2720 + }, + { + "epoch": 0.21, + "grad_norm": 1.3095366767358483, + "learning_rate": 1.832906408507553e-05, + "loss": 0.6206, + "step": 2721 + }, + { + "epoch": 0.21, + "grad_norm": 1.314702864752759, + "learning_rate": 1.8327673275127177e-05, + "loss": 0.669, + "step": 2722 + }, + { + "epoch": 0.21, + "grad_norm": 1.191442370258049, + "learning_rate": 1.832628193940984e-05, + "loss": 0.6409, + "step": 2723 + }, + { + "epoch": 0.21, + "grad_norm": 1.3098278091190558, + "learning_rate": 1.832489007801137e-05, + "loss": 0.6143, + "step": 2724 + }, + { + "epoch": 0.21, + "grad_norm": 1.1939614832622507, + "learning_rate": 1.832349769101964e-05, + "loss": 0.5463, + "step": 2725 + }, + { + "epoch": 0.21, + "grad_norm": 1.284174071325053, + "learning_rate": 1.832210477852255e-05, + "loss": 0.6731, + "step": 2726 + }, + { + "epoch": 0.21, + "grad_norm": 1.1573902384579537, + "learning_rate": 1.8320711340608058e-05, + "loss": 0.6316, + "step": 2727 + }, + { + "epoch": 0.21, + "grad_norm": 1.210314633549534, + "learning_rate": 1.8319317377364124e-05, + "loss": 0.6184, + "step": 2728 + }, + { + "epoch": 0.21, + "grad_norm": 1.216356519247249, + "learning_rate": 1.8317922888878763e-05, + "loss": 0.6378, + "step": 2729 + }, + { + "epoch": 0.21, + "grad_norm": 1.2886793549763733, + "learning_rate": 1.8316527875240016e-05, + "loss": 0.5714, + "step": 2730 + }, + { + "epoch": 0.21, + "grad_norm": 1.1853238293453052, + "learning_rate": 1.8315132336535956e-05, + "loss": 0.5943, + "step": 2731 + }, + { + "epoch": 0.21, + "grad_norm": 1.4009815283516742, + "learning_rate": 1.8313736272854692e-05, + "loss": 0.5872, + "step": 2732 + }, + { + "epoch": 0.21, + "grad_norm": 1.3185053991757636, + "learning_rate": 1.8312339684284363e-05, + "loss": 0.7169, + "step": 2733 + }, + { + "epoch": 0.21, + "grad_norm": 1.1847111931889096, + "learning_rate": 1.831094257091314e-05, + "loss": 0.6049, + "step": 2734 + }, + { + "epoch": 0.21, + "grad_norm": 1.2837781851403587, + "learning_rate": 1.8309544932829238e-05, + "loss": 0.6198, + "step": 2735 + }, + { + "epoch": 0.21, + "grad_norm": 1.2952410964703984, + "learning_rate": 1.8308146770120893e-05, + "loss": 0.6702, + "step": 2736 + }, + { + "epoch": 0.21, + "grad_norm": 1.3519706771227036, + "learning_rate": 1.8306748082876377e-05, + "loss": 0.6492, + "step": 2737 + }, + { + "epoch": 0.21, + "grad_norm": 1.2180031664514333, + "learning_rate": 1.8305348871183998e-05, + "loss": 0.6082, + "step": 2738 + }, + { + "epoch": 0.21, + "grad_norm": 1.2314869376762534, + "learning_rate": 1.8303949135132096e-05, + "loss": 0.6197, + "step": 2739 + }, + { + "epoch": 0.21, + "grad_norm": 1.2433844982968294, + "learning_rate": 1.8302548874809042e-05, + "loss": 0.6288, + "step": 2740 + }, + { + "epoch": 0.21, + "grad_norm": 1.209795357841027, + "learning_rate": 1.8301148090303238e-05, + "loss": 0.625, + "step": 2741 + }, + { + "epoch": 0.21, + "grad_norm": 1.0811275192303724, + "learning_rate": 1.829974678170313e-05, + "loss": 0.5735, + "step": 2742 + }, + { + "epoch": 0.21, + "grad_norm": 1.1913268516224023, + "learning_rate": 1.8298344949097186e-05, + "loss": 0.5761, + "step": 2743 + }, + { + "epoch": 0.21, + "grad_norm": 1.2868632195532392, + "learning_rate": 1.8296942592573912e-05, + "loss": 0.5971, + "step": 2744 + }, + { + "epoch": 0.21, + "grad_norm": 1.2610298378187574, + "learning_rate": 1.8295539712221844e-05, + "loss": 0.5866, + "step": 2745 + }, + { + "epoch": 0.21, + "grad_norm": 1.14555682689655, + "learning_rate": 1.8294136308129556e-05, + "loss": 0.6317, + "step": 2746 + }, + { + "epoch": 0.21, + "grad_norm": 1.1480329151219886, + "learning_rate": 1.829273238038565e-05, + "loss": 0.6175, + "step": 2747 + }, + { + "epoch": 0.21, + "grad_norm": 1.3364857223610094, + "learning_rate": 1.8291327929078764e-05, + "loss": 0.6465, + "step": 2748 + }, + { + "epoch": 0.21, + "grad_norm": 1.218675855679861, + "learning_rate": 1.828992295429757e-05, + "loss": 0.5915, + "step": 2749 + }, + { + "epoch": 0.21, + "grad_norm": 1.069779256261487, + "learning_rate": 1.828851745613077e-05, + "loss": 0.5412, + "step": 2750 + }, + { + "epoch": 0.21, + "grad_norm": 1.2703041892817974, + "learning_rate": 1.8287111434667096e-05, + "loss": 0.598, + "step": 2751 + }, + { + "epoch": 0.21, + "grad_norm": 1.3296854613094637, + "learning_rate": 1.828570488999532e-05, + "loss": 0.6106, + "step": 2752 + }, + { + "epoch": 0.21, + "grad_norm": 1.2935112682245427, + "learning_rate": 1.8284297822204252e-05, + "loss": 0.6521, + "step": 2753 + }, + { + "epoch": 0.21, + "grad_norm": 1.323029876230364, + "learning_rate": 1.8282890231382716e-05, + "loss": 0.6468, + "step": 2754 + }, + { + "epoch": 0.21, + "grad_norm": 1.2037016242300043, + "learning_rate": 1.828148211761959e-05, + "loss": 0.6173, + "step": 2755 + }, + { + "epoch": 0.21, + "grad_norm": 1.2792445514258015, + "learning_rate": 1.828007348100377e-05, + "loss": 0.6499, + "step": 2756 + }, + { + "epoch": 0.21, + "grad_norm": 1.3389547084302056, + "learning_rate": 1.827866432162419e-05, + "loss": 0.6689, + "step": 2757 + }, + { + "epoch": 0.21, + "grad_norm": 1.187141063050096, + "learning_rate": 1.8277254639569817e-05, + "loss": 0.6608, + "step": 2758 + }, + { + "epoch": 0.21, + "grad_norm": 1.3522860844756028, + "learning_rate": 1.827584443492966e-05, + "loss": 0.5747, + "step": 2759 + }, + { + "epoch": 0.21, + "grad_norm": 1.2429242138703271, + "learning_rate": 1.827443370779274e-05, + "loss": 0.669, + "step": 2760 + }, + { + "epoch": 0.21, + "grad_norm": 1.1528596919165448, + "learning_rate": 1.827302245824813e-05, + "loss": 0.5587, + "step": 2761 + }, + { + "epoch": 0.21, + "grad_norm": 1.2648710194900268, + "learning_rate": 1.827161068638493e-05, + "loss": 0.6612, + "step": 2762 + }, + { + "epoch": 0.21, + "grad_norm": 1.3514441289498784, + "learning_rate": 1.8270198392292276e-05, + "loss": 0.7449, + "step": 2763 + }, + { + "epoch": 0.21, + "grad_norm": 1.1608384820495767, + "learning_rate": 1.8268785576059324e-05, + "loss": 0.5977, + "step": 2764 + }, + { + "epoch": 0.21, + "grad_norm": 1.3438503760174543, + "learning_rate": 1.8267372237775278e-05, + "loss": 0.6776, + "step": 2765 + }, + { + "epoch": 0.21, + "grad_norm": 1.1346874111619902, + "learning_rate": 1.826595837752937e-05, + "loss": 0.613, + "step": 2766 + }, + { + "epoch": 0.21, + "grad_norm": 1.2276459786188239, + "learning_rate": 1.826454399541086e-05, + "loss": 0.6094, + "step": 2767 + }, + { + "epoch": 0.21, + "grad_norm": 1.1437264695378795, + "learning_rate": 1.826312909150905e-05, + "loss": 0.5423, + "step": 2768 + }, + { + "epoch": 0.21, + "grad_norm": 1.1743204667939153, + "learning_rate": 1.8261713665913268e-05, + "loss": 0.6243, + "step": 2769 + }, + { + "epoch": 0.21, + "grad_norm": 1.3523323644801177, + "learning_rate": 1.8260297718712877e-05, + "loss": 0.6125, + "step": 2770 + }, + { + "epoch": 0.21, + "grad_norm": 1.23234212046008, + "learning_rate": 1.8258881249997275e-05, + "loss": 0.6303, + "step": 2771 + }, + { + "epoch": 0.22, + "grad_norm": 1.2842448983116175, + "learning_rate": 1.825746425985589e-05, + "loss": 0.6576, + "step": 2772 + }, + { + "epoch": 0.22, + "grad_norm": 1.4215279459532615, + "learning_rate": 1.825604674837818e-05, + "loss": 0.658, + "step": 2773 + }, + { + "epoch": 0.22, + "grad_norm": 1.340826047036285, + "learning_rate": 1.8254628715653647e-05, + "loss": 0.6525, + "step": 2774 + }, + { + "epoch": 0.22, + "grad_norm": 1.1641832839464328, + "learning_rate": 1.825321016177181e-05, + "loss": 0.6072, + "step": 2775 + }, + { + "epoch": 0.22, + "grad_norm": 1.2655579820008127, + "learning_rate": 1.825179108682224e-05, + "loss": 0.663, + "step": 2776 + }, + { + "epoch": 0.22, + "grad_norm": 1.2396023318760847, + "learning_rate": 1.8250371490894524e-05, + "loss": 0.6118, + "step": 2777 + }, + { + "epoch": 0.22, + "grad_norm": 1.0523134426858805, + "learning_rate": 1.8248951374078288e-05, + "loss": 0.577, + "step": 2778 + }, + { + "epoch": 0.22, + "grad_norm": 1.1822575243530766, + "learning_rate": 1.8247530736463194e-05, + "loss": 0.6171, + "step": 2779 + }, + { + "epoch": 0.22, + "grad_norm": 1.3799827199351595, + "learning_rate": 1.8246109578138932e-05, + "loss": 0.6211, + "step": 2780 + }, + { + "epoch": 0.22, + "grad_norm": 1.223821407802544, + "learning_rate": 1.8244687899195224e-05, + "loss": 0.651, + "step": 2781 + }, + { + "epoch": 0.22, + "grad_norm": 1.1692815948648125, + "learning_rate": 1.824326569972184e-05, + "loss": 0.5911, + "step": 2782 + }, + { + "epoch": 0.22, + "grad_norm": 1.267942778767351, + "learning_rate": 1.8241842979808554e-05, + "loss": 0.6236, + "step": 2783 + }, + { + "epoch": 0.22, + "grad_norm": 1.221052537882822, + "learning_rate": 1.8240419739545206e-05, + "loss": 0.578, + "step": 2784 + }, + { + "epoch": 0.22, + "grad_norm": 1.2035246841599823, + "learning_rate": 1.8238995979021642e-05, + "loss": 0.5699, + "step": 2785 + }, + { + "epoch": 0.22, + "grad_norm": 1.2816396911487473, + "learning_rate": 1.8237571698327752e-05, + "loss": 0.5972, + "step": 2786 + }, + { + "epoch": 0.22, + "grad_norm": 1.2643952222970967, + "learning_rate": 1.8236146897553464e-05, + "loss": 0.6583, + "step": 2787 + }, + { + "epoch": 0.22, + "grad_norm": 1.3673255632716947, + "learning_rate": 1.8234721576788726e-05, + "loss": 0.6039, + "step": 2788 + }, + { + "epoch": 0.22, + "grad_norm": 1.2348573442454402, + "learning_rate": 1.823329573612353e-05, + "loss": 0.6543, + "step": 2789 + }, + { + "epoch": 0.22, + "grad_norm": 1.1745171325817638, + "learning_rate": 1.8231869375647897e-05, + "loss": 0.565, + "step": 2790 + }, + { + "epoch": 0.22, + "grad_norm": 1.2424715784917095, + "learning_rate": 1.823044249545188e-05, + "loss": 0.6605, + "step": 2791 + }, + { + "epoch": 0.22, + "grad_norm": 1.2302056409702453, + "learning_rate": 1.8229015095625567e-05, + "loss": 0.6465, + "step": 2792 + }, + { + "epoch": 0.22, + "grad_norm": 1.252995478150778, + "learning_rate": 1.822758717625907e-05, + "loss": 0.6635, + "step": 2793 + }, + { + "epoch": 0.22, + "grad_norm": 1.2159450237244342, + "learning_rate": 1.8226158737442547e-05, + "loss": 0.6204, + "step": 2794 + }, + { + "epoch": 0.22, + "grad_norm": 1.1720020988204702, + "learning_rate": 1.8224729779266183e-05, + "loss": 0.5526, + "step": 2795 + }, + { + "epoch": 0.22, + "grad_norm": 1.1946464627716764, + "learning_rate": 1.8223300301820193e-05, + "loss": 0.6274, + "step": 2796 + }, + { + "epoch": 0.22, + "grad_norm": 1.1868871311975935, + "learning_rate": 1.8221870305194828e-05, + "loss": 0.5913, + "step": 2797 + }, + { + "epoch": 0.22, + "grad_norm": 1.3490003434443054, + "learning_rate": 1.8220439789480375e-05, + "loss": 0.6588, + "step": 2798 + }, + { + "epoch": 0.22, + "grad_norm": 1.2190805622670569, + "learning_rate": 1.8219008754767144e-05, + "loss": 0.6279, + "step": 2799 + }, + { + "epoch": 0.22, + "grad_norm": 1.3299475786447579, + "learning_rate": 1.8217577201145483e-05, + "loss": 0.6658, + "step": 2800 + }, + { + "epoch": 0.22, + "grad_norm": 1.1969145467016507, + "learning_rate": 1.821614512870578e-05, + "loss": 0.6126, + "step": 2801 + }, + { + "epoch": 0.22, + "grad_norm": 1.1769735684599172, + "learning_rate": 1.821471253753844e-05, + "loss": 0.6276, + "step": 2802 + }, + { + "epoch": 0.22, + "grad_norm": 1.4148715744779188, + "learning_rate": 1.821327942773392e-05, + "loss": 0.6855, + "step": 2803 + }, + { + "epoch": 0.22, + "grad_norm": 1.3296521999097788, + "learning_rate": 1.8211845799382693e-05, + "loss": 0.6663, + "step": 2804 + }, + { + "epoch": 0.22, + "grad_norm": 1.2144700239054251, + "learning_rate": 1.8210411652575275e-05, + "loss": 0.6591, + "step": 2805 + }, + { + "epoch": 0.22, + "grad_norm": 1.3841934205573372, + "learning_rate": 1.8208976987402206e-05, + "loss": 0.6926, + "step": 2806 + }, + { + "epoch": 0.22, + "grad_norm": 1.2289015219973962, + "learning_rate": 1.8207541803954068e-05, + "loss": 0.6341, + "step": 2807 + }, + { + "epoch": 0.22, + "grad_norm": 1.2868690555800195, + "learning_rate": 1.8206106102321473e-05, + "loss": 0.5873, + "step": 2808 + }, + { + "epoch": 0.22, + "grad_norm": 1.2454855460021577, + "learning_rate": 1.820466988259506e-05, + "loss": 0.6953, + "step": 2809 + }, + { + "epoch": 0.22, + "grad_norm": 1.0982308292361256, + "learning_rate": 1.820323314486551e-05, + "loss": 0.5464, + "step": 2810 + }, + { + "epoch": 0.22, + "grad_norm": 1.1006471875806687, + "learning_rate": 1.8201795889223525e-05, + "loss": 0.5914, + "step": 2811 + }, + { + "epoch": 0.22, + "grad_norm": 1.2006455671781824, + "learning_rate": 1.8200358115759847e-05, + "loss": 0.5893, + "step": 2812 + }, + { + "epoch": 0.22, + "grad_norm": 1.3924243301706416, + "learning_rate": 1.8198919824565258e-05, + "loss": 0.6627, + "step": 2813 + }, + { + "epoch": 0.22, + "grad_norm": 1.4486124667564348, + "learning_rate": 1.8197481015730555e-05, + "loss": 0.6678, + "step": 2814 + }, + { + "epoch": 0.22, + "grad_norm": 1.2937493402599183, + "learning_rate": 1.8196041689346585e-05, + "loss": 0.6645, + "step": 2815 + }, + { + "epoch": 0.22, + "grad_norm": 1.3723180363975969, + "learning_rate": 1.819460184550422e-05, + "loss": 0.6615, + "step": 2816 + }, + { + "epoch": 0.22, + "grad_norm": 1.2960667676916562, + "learning_rate": 1.8193161484294358e-05, + "loss": 0.6251, + "step": 2817 + }, + { + "epoch": 0.22, + "grad_norm": 1.2350914240214055, + "learning_rate": 1.8191720605807937e-05, + "loss": 0.621, + "step": 2818 + }, + { + "epoch": 0.22, + "grad_norm": 1.1791714177299912, + "learning_rate": 1.819027921013594e-05, + "loss": 0.5851, + "step": 2819 + }, + { + "epoch": 0.22, + "grad_norm": 1.4171525271641732, + "learning_rate": 1.8188837297369353e-05, + "loss": 0.6497, + "step": 2820 + }, + { + "epoch": 0.22, + "grad_norm": 1.250284305665552, + "learning_rate": 1.818739486759922e-05, + "loss": 0.6881, + "step": 2821 + }, + { + "epoch": 0.22, + "grad_norm": 1.3425361118396884, + "learning_rate": 1.8185951920916607e-05, + "loss": 0.655, + "step": 2822 + }, + { + "epoch": 0.22, + "grad_norm": 1.2578889337856087, + "learning_rate": 1.8184508457412615e-05, + "loss": 0.6892, + "step": 2823 + }, + { + "epoch": 0.22, + "grad_norm": 1.2068249177344736, + "learning_rate": 1.818306447717838e-05, + "loss": 0.6273, + "step": 2824 + }, + { + "epoch": 0.22, + "grad_norm": 1.2200117182559702, + "learning_rate": 1.8181619980305065e-05, + "loss": 0.5969, + "step": 2825 + }, + { + "epoch": 0.22, + "grad_norm": 1.2984507076909155, + "learning_rate": 1.818017496688387e-05, + "loss": 0.5938, + "step": 2826 + }, + { + "epoch": 0.22, + "grad_norm": 1.2019157277396073, + "learning_rate": 1.817872943700602e-05, + "loss": 0.6362, + "step": 2827 + }, + { + "epoch": 0.22, + "grad_norm": 1.2322419967809393, + "learning_rate": 1.817728339076279e-05, + "loss": 0.6147, + "step": 2828 + }, + { + "epoch": 0.22, + "grad_norm": 1.356758493475954, + "learning_rate": 1.8175836828245467e-05, + "loss": 0.6504, + "step": 2829 + }, + { + "epoch": 0.22, + "grad_norm": 1.2897517817870001, + "learning_rate": 1.8174389749545384e-05, + "loss": 0.7105, + "step": 2830 + }, + { + "epoch": 0.22, + "grad_norm": 1.2157061297255327, + "learning_rate": 1.81729421547539e-05, + "loss": 0.6304, + "step": 2831 + }, + { + "epoch": 0.22, + "grad_norm": 1.2331171024493035, + "learning_rate": 1.8171494043962413e-05, + "loss": 0.6363, + "step": 2832 + }, + { + "epoch": 0.22, + "grad_norm": 1.2129381941406479, + "learning_rate": 1.817004541726235e-05, + "loss": 0.6248, + "step": 2833 + }, + { + "epoch": 0.22, + "grad_norm": 1.3303364462262077, + "learning_rate": 1.8168596274745165e-05, + "loss": 0.6512, + "step": 2834 + }, + { + "epoch": 0.22, + "grad_norm": 1.1810256240319497, + "learning_rate": 1.8167146616502353e-05, + "loss": 0.6057, + "step": 2835 + }, + { + "epoch": 0.22, + "grad_norm": 1.2015725560753407, + "learning_rate": 1.816569644262544e-05, + "loss": 0.6193, + "step": 2836 + }, + { + "epoch": 0.22, + "grad_norm": 1.1208913268126142, + "learning_rate": 1.8164245753205977e-05, + "loss": 0.5856, + "step": 2837 + }, + { + "epoch": 0.22, + "grad_norm": 1.1176206909308237, + "learning_rate": 1.816279454833556e-05, + "loss": 0.5683, + "step": 2838 + }, + { + "epoch": 0.22, + "grad_norm": 1.3175210731746485, + "learning_rate": 1.8161342828105806e-05, + "loss": 0.6368, + "step": 2839 + }, + { + "epoch": 0.22, + "grad_norm": 1.1667704479288525, + "learning_rate": 1.8159890592608378e-05, + "loss": 0.5324, + "step": 2840 + }, + { + "epoch": 0.22, + "grad_norm": 1.0988289212991194, + "learning_rate": 1.8158437841934954e-05, + "loss": 0.5394, + "step": 2841 + }, + { + "epoch": 0.22, + "grad_norm": 1.2842564549000774, + "learning_rate": 1.8156984576177258e-05, + "loss": 0.6238, + "step": 2842 + }, + { + "epoch": 0.22, + "grad_norm": 1.2009700450512546, + "learning_rate": 1.815553079542704e-05, + "loss": 0.6667, + "step": 2843 + }, + { + "epoch": 0.22, + "grad_norm": 1.341760404443746, + "learning_rate": 1.8154076499776087e-05, + "loss": 0.6925, + "step": 2844 + }, + { + "epoch": 0.22, + "grad_norm": 1.303661834134969, + "learning_rate": 1.8152621689316216e-05, + "loss": 0.6543, + "step": 2845 + }, + { + "epoch": 0.22, + "grad_norm": 1.1934411353340328, + "learning_rate": 1.815116636413927e-05, + "loss": 0.5741, + "step": 2846 + }, + { + "epoch": 0.22, + "grad_norm": 1.2367567913207473, + "learning_rate": 1.8149710524337143e-05, + "loss": 0.5939, + "step": 2847 + }, + { + "epoch": 0.22, + "grad_norm": 1.3560427459010786, + "learning_rate": 1.8148254170001743e-05, + "loss": 0.6495, + "step": 2848 + }, + { + "epoch": 0.22, + "grad_norm": 1.263134377740612, + "learning_rate": 1.8146797301225017e-05, + "loss": 0.6687, + "step": 2849 + }, + { + "epoch": 0.22, + "grad_norm": 1.173790701017271, + "learning_rate": 1.8145339918098944e-05, + "loss": 0.6114, + "step": 2850 + }, + { + "epoch": 0.22, + "grad_norm": 1.1777002636475393, + "learning_rate": 1.8143882020715537e-05, + "loss": 0.5727, + "step": 2851 + }, + { + "epoch": 0.22, + "grad_norm": 1.1990217115331079, + "learning_rate": 1.8142423609166845e-05, + "loss": 0.6531, + "step": 2852 + }, + { + "epoch": 0.22, + "grad_norm": 1.2039315628866218, + "learning_rate": 1.8140964683544936e-05, + "loss": 0.626, + "step": 2853 + }, + { + "epoch": 0.22, + "grad_norm": 1.2090947558373764, + "learning_rate": 1.813950524394193e-05, + "loss": 0.613, + "step": 2854 + }, + { + "epoch": 0.22, + "grad_norm": 1.0763227907347506, + "learning_rate": 1.813804529044996e-05, + "loss": 0.5894, + "step": 2855 + }, + { + "epoch": 0.22, + "grad_norm": 1.103214621212828, + "learning_rate": 1.8136584823161205e-05, + "loss": 0.5624, + "step": 2856 + }, + { + "epoch": 0.22, + "grad_norm": 1.279610398872814, + "learning_rate": 1.8135123842167873e-05, + "loss": 0.5917, + "step": 2857 + }, + { + "epoch": 0.22, + "grad_norm": 1.1060580243278069, + "learning_rate": 1.8133662347562197e-05, + "loss": 0.5864, + "step": 2858 + }, + { + "epoch": 0.22, + "grad_norm": 1.159203391728596, + "learning_rate": 1.8132200339436455e-05, + "loss": 0.5723, + "step": 2859 + }, + { + "epoch": 0.22, + "grad_norm": 1.2723583961795326, + "learning_rate": 1.813073781788295e-05, + "loss": 0.6118, + "step": 2860 + }, + { + "epoch": 0.22, + "grad_norm": 1.2273011140077157, + "learning_rate": 1.8129274782994016e-05, + "loss": 0.6806, + "step": 2861 + }, + { + "epoch": 0.22, + "grad_norm": 1.2921420217350321, + "learning_rate": 1.8127811234862026e-05, + "loss": 0.6283, + "step": 2862 + }, + { + "epoch": 0.22, + "grad_norm": 1.2323564369977353, + "learning_rate": 1.8126347173579373e-05, + "loss": 0.6112, + "step": 2863 + }, + { + "epoch": 0.22, + "grad_norm": 1.207304938931562, + "learning_rate": 1.8124882599238504e-05, + "loss": 0.6128, + "step": 2864 + }, + { + "epoch": 0.22, + "grad_norm": 1.1578066373977807, + "learning_rate": 1.812341751193187e-05, + "loss": 0.6047, + "step": 2865 + }, + { + "epoch": 0.22, + "grad_norm": 1.4005586956898046, + "learning_rate": 1.812195191175198e-05, + "loss": 0.6594, + "step": 2866 + }, + { + "epoch": 0.22, + "grad_norm": 1.228123540125166, + "learning_rate": 1.8120485798791364e-05, + "loss": 0.6249, + "step": 2867 + }, + { + "epoch": 0.22, + "grad_norm": 1.1568415391597346, + "learning_rate": 1.8119019173142583e-05, + "loss": 0.6069, + "step": 2868 + }, + { + "epoch": 0.22, + "grad_norm": 1.1911612837300505, + "learning_rate": 1.811755203489823e-05, + "loss": 0.6288, + "step": 2869 + }, + { + "epoch": 0.22, + "grad_norm": 1.343495943759691, + "learning_rate": 1.811608438415094e-05, + "loss": 0.6332, + "step": 2870 + }, + { + "epoch": 0.22, + "grad_norm": 1.2407355309741295, + "learning_rate": 1.8114616220993367e-05, + "loss": 0.6265, + "step": 2871 + }, + { + "epoch": 0.22, + "grad_norm": 1.2773169185141842, + "learning_rate": 1.8113147545518207e-05, + "loss": 0.626, + "step": 2872 + }, + { + "epoch": 0.22, + "grad_norm": 1.252796050011533, + "learning_rate": 1.8111678357818184e-05, + "loss": 0.6127, + "step": 2873 + }, + { + "epoch": 0.22, + "grad_norm": 1.19824671414118, + "learning_rate": 1.8110208657986056e-05, + "loss": 0.6131, + "step": 2874 + }, + { + "epoch": 0.22, + "grad_norm": 1.1883097949289403, + "learning_rate": 1.810873844611461e-05, + "loss": 0.5985, + "step": 2875 + }, + { + "epoch": 0.22, + "grad_norm": 1.2814209521617395, + "learning_rate": 1.810726772229667e-05, + "loss": 0.6164, + "step": 2876 + }, + { + "epoch": 0.22, + "grad_norm": 1.2923911841681872, + "learning_rate": 1.8105796486625095e-05, + "loss": 0.6373, + "step": 2877 + }, + { + "epoch": 0.22, + "grad_norm": 1.2575262940617395, + "learning_rate": 1.8104324739192766e-05, + "loss": 0.6791, + "step": 2878 + }, + { + "epoch": 0.22, + "grad_norm": 1.3457072884947838, + "learning_rate": 1.81028524800926e-05, + "loss": 0.6028, + "step": 2879 + }, + { + "epoch": 0.22, + "grad_norm": 1.2840021859266266, + "learning_rate": 1.8101379709417556e-05, + "loss": 0.6638, + "step": 2880 + }, + { + "epoch": 0.22, + "grad_norm": 1.2326226169186743, + "learning_rate": 1.809990642726061e-05, + "loss": 0.6518, + "step": 2881 + }, + { + "epoch": 0.22, + "grad_norm": 1.1997694807368846, + "learning_rate": 1.809843263371478e-05, + "loss": 0.5913, + "step": 2882 + }, + { + "epoch": 0.22, + "grad_norm": 1.2629495764488314, + "learning_rate": 1.809695832887312e-05, + "loss": 0.6485, + "step": 2883 + }, + { + "epoch": 0.22, + "grad_norm": 1.099901448516766, + "learning_rate": 1.8095483512828705e-05, + "loss": 0.6097, + "step": 2884 + }, + { + "epoch": 0.22, + "grad_norm": 1.231177328840679, + "learning_rate": 1.809400818567465e-05, + "loss": 0.6192, + "step": 2885 + }, + { + "epoch": 0.22, + "grad_norm": 1.3685103637782334, + "learning_rate": 1.809253234750409e-05, + "loss": 0.641, + "step": 2886 + }, + { + "epoch": 0.22, + "grad_norm": 1.1997628235875848, + "learning_rate": 1.809105599841022e-05, + "loss": 0.6401, + "step": 2887 + }, + { + "epoch": 0.22, + "grad_norm": 1.0968343026199854, + "learning_rate": 1.8089579138486234e-05, + "loss": 0.5575, + "step": 2888 + }, + { + "epoch": 0.22, + "grad_norm": 1.2138150879903193, + "learning_rate": 1.8088101767825385e-05, + "loss": 0.6557, + "step": 2889 + }, + { + "epoch": 0.22, + "grad_norm": 1.2339204361674816, + "learning_rate": 1.8086623886520942e-05, + "loss": 0.6035, + "step": 2890 + }, + { + "epoch": 0.22, + "grad_norm": 1.2712415237669306, + "learning_rate": 1.8085145494666208e-05, + "loss": 0.6331, + "step": 2891 + }, + { + "epoch": 0.22, + "grad_norm": 1.2377801115733877, + "learning_rate": 1.808366659235453e-05, + "loss": 0.5811, + "step": 2892 + }, + { + "epoch": 0.22, + "grad_norm": 1.3286606101312584, + "learning_rate": 1.8082187179679272e-05, + "loss": 0.6674, + "step": 2893 + }, + { + "epoch": 0.22, + "grad_norm": 1.0994433316610714, + "learning_rate": 1.8080707256733837e-05, + "loss": 0.566, + "step": 2894 + }, + { + "epoch": 0.22, + "grad_norm": 1.223931181045098, + "learning_rate": 1.8079226823611665e-05, + "loss": 0.6318, + "step": 2895 + }, + { + "epoch": 0.22, + "grad_norm": 1.255745984975725, + "learning_rate": 1.807774588040622e-05, + "loss": 0.6282, + "step": 2896 + }, + { + "epoch": 0.22, + "grad_norm": 1.329411993834681, + "learning_rate": 1.8076264427210997e-05, + "loss": 0.6458, + "step": 2897 + }, + { + "epoch": 0.22, + "grad_norm": 1.3125825129412299, + "learning_rate": 1.8074782464119536e-05, + "loss": 0.6372, + "step": 2898 + }, + { + "epoch": 0.22, + "grad_norm": 1.267016881705333, + "learning_rate": 1.8073299991225398e-05, + "loss": 0.5849, + "step": 2899 + }, + { + "epoch": 0.22, + "grad_norm": 1.2533980435971066, + "learning_rate": 1.8071817008622177e-05, + "loss": 0.628, + "step": 2900 + }, + { + "epoch": 0.23, + "grad_norm": 1.2003904700783439, + "learning_rate": 1.8070333516403505e-05, + "loss": 0.5822, + "step": 2901 + }, + { + "epoch": 0.23, + "grad_norm": 1.2487166015160502, + "learning_rate": 1.806884951466304e-05, + "loss": 0.6062, + "step": 2902 + }, + { + "epoch": 0.23, + "grad_norm": 1.3581030584143734, + "learning_rate": 1.8067365003494475e-05, + "loss": 0.6601, + "step": 2903 + }, + { + "epoch": 0.23, + "grad_norm": 1.2631206459902808, + "learning_rate": 1.8065879982991536e-05, + "loss": 0.5929, + "step": 2904 + }, + { + "epoch": 0.23, + "grad_norm": 1.2821168990644012, + "learning_rate": 1.8064394453247977e-05, + "loss": 0.5959, + "step": 2905 + }, + { + "epoch": 0.23, + "grad_norm": 1.2204427945062277, + "learning_rate": 1.8062908414357592e-05, + "loss": 0.5873, + "step": 2906 + }, + { + "epoch": 0.23, + "grad_norm": 1.248558500725858, + "learning_rate": 1.80614218664142e-05, + "loss": 0.623, + "step": 2907 + }, + { + "epoch": 0.23, + "grad_norm": 1.2202886503382502, + "learning_rate": 1.8059934809511654e-05, + "loss": 0.5808, + "step": 2908 + }, + { + "epoch": 0.23, + "grad_norm": 1.2979357414415547, + "learning_rate": 1.805844724374384e-05, + "loss": 0.6001, + "step": 2909 + }, + { + "epoch": 0.23, + "grad_norm": 1.2011587746999037, + "learning_rate": 1.805695916920467e-05, + "loss": 0.5736, + "step": 2910 + }, + { + "epoch": 0.23, + "grad_norm": 1.3875532191612105, + "learning_rate": 1.8055470585988108e-05, + "loss": 0.6179, + "step": 2911 + }, + { + "epoch": 0.23, + "grad_norm": 1.2902609138404146, + "learning_rate": 1.805398149418812e-05, + "loss": 0.6515, + "step": 2912 + }, + { + "epoch": 0.23, + "grad_norm": 1.3347383884311403, + "learning_rate": 1.805249189389873e-05, + "loss": 0.6505, + "step": 2913 + }, + { + "epoch": 0.23, + "grad_norm": 1.3294166118671473, + "learning_rate": 1.805100178521398e-05, + "loss": 0.6658, + "step": 2914 + }, + { + "epoch": 0.23, + "grad_norm": 1.2300501035549545, + "learning_rate": 1.804951116822795e-05, + "loss": 0.6171, + "step": 2915 + }, + { + "epoch": 0.23, + "grad_norm": 1.2054727115430035, + "learning_rate": 1.804802004303475e-05, + "loss": 0.6213, + "step": 2916 + }, + { + "epoch": 0.23, + "grad_norm": 1.3563128654571248, + "learning_rate": 1.8046528409728523e-05, + "loss": 0.6245, + "step": 2917 + }, + { + "epoch": 0.23, + "grad_norm": 1.3001366928567972, + "learning_rate": 1.8045036268403443e-05, + "loss": 0.6708, + "step": 2918 + }, + { + "epoch": 0.23, + "grad_norm": 1.1874836870378034, + "learning_rate": 1.8043543619153717e-05, + "loss": 0.6237, + "step": 2919 + }, + { + "epoch": 0.23, + "grad_norm": 1.256463885722255, + "learning_rate": 1.804205046207358e-05, + "loss": 0.6283, + "step": 2920 + }, + { + "epoch": 0.23, + "grad_norm": 1.2557813937595423, + "learning_rate": 1.8040556797257308e-05, + "loss": 0.5638, + "step": 2921 + }, + { + "epoch": 0.23, + "grad_norm": 1.2026769125661274, + "learning_rate": 1.80390626247992e-05, + "loss": 0.6409, + "step": 2922 + }, + { + "epoch": 0.23, + "grad_norm": 1.2871283152088284, + "learning_rate": 1.80375679447936e-05, + "loss": 0.6575, + "step": 2923 + }, + { + "epoch": 0.23, + "grad_norm": 1.2586342159232753, + "learning_rate": 1.803607275733486e-05, + "loss": 0.6328, + "step": 2924 + }, + { + "epoch": 0.23, + "grad_norm": 1.1032225092944932, + "learning_rate": 1.8034577062517383e-05, + "loss": 0.5829, + "step": 2925 + }, + { + "epoch": 0.23, + "grad_norm": 1.3104410370388593, + "learning_rate": 1.803308086043561e-05, + "loss": 0.6742, + "step": 2926 + }, + { + "epoch": 0.23, + "grad_norm": 1.2051548863822021, + "learning_rate": 1.8031584151183995e-05, + "loss": 0.5936, + "step": 2927 + }, + { + "epoch": 0.23, + "grad_norm": 1.2517796722089043, + "learning_rate": 1.8030086934857034e-05, + "loss": 0.6459, + "step": 2928 + }, + { + "epoch": 0.23, + "grad_norm": 1.2320767507333181, + "learning_rate": 1.8028589211549256e-05, + "loss": 0.603, + "step": 2929 + }, + { + "epoch": 0.23, + "grad_norm": 1.2086792100061583, + "learning_rate": 1.8027090981355217e-05, + "loss": 0.6299, + "step": 2930 + }, + { + "epoch": 0.23, + "grad_norm": 1.2700898804736247, + "learning_rate": 1.8025592244369514e-05, + "loss": 0.6605, + "step": 2931 + }, + { + "epoch": 0.23, + "grad_norm": 1.206916927121023, + "learning_rate": 1.802409300068676e-05, + "loss": 0.614, + "step": 2932 + }, + { + "epoch": 0.23, + "grad_norm": 1.168172608002909, + "learning_rate": 1.8022593250401625e-05, + "loss": 0.625, + "step": 2933 + }, + { + "epoch": 0.23, + "grad_norm": 1.1834166240034456, + "learning_rate": 1.8021092993608777e-05, + "loss": 0.5964, + "step": 2934 + }, + { + "epoch": 0.23, + "grad_norm": 1.2788494696389363, + "learning_rate": 1.801959223040295e-05, + "loss": 0.6234, + "step": 2935 + }, + { + "epoch": 0.23, + "grad_norm": 1.187926767860014, + "learning_rate": 1.801809096087889e-05, + "loss": 0.6049, + "step": 2936 + }, + { + "epoch": 0.23, + "grad_norm": 1.267593407208171, + "learning_rate": 1.801658918513138e-05, + "loss": 0.6388, + "step": 2937 + }, + { + "epoch": 0.23, + "grad_norm": 1.136411330349278, + "learning_rate": 1.801508690325523e-05, + "loss": 0.5344, + "step": 2938 + }, + { + "epoch": 0.23, + "grad_norm": 1.1600192724468503, + "learning_rate": 1.8013584115345297e-05, + "loss": 0.5305, + "step": 2939 + }, + { + "epoch": 0.23, + "grad_norm": 1.2332262899152213, + "learning_rate": 1.8012080821496454e-05, + "loss": 0.583, + "step": 2940 + }, + { + "epoch": 0.23, + "grad_norm": 1.3254157313839539, + "learning_rate": 1.8010577021803608e-05, + "loss": 0.6546, + "step": 2941 + }, + { + "epoch": 0.23, + "grad_norm": 1.2185395866102802, + "learning_rate": 1.8009072716361707e-05, + "loss": 0.6275, + "step": 2942 + }, + { + "epoch": 0.23, + "grad_norm": 1.271840316711964, + "learning_rate": 1.8007567905265727e-05, + "loss": 0.613, + "step": 2943 + }, + { + "epoch": 0.23, + "grad_norm": 1.169847132567219, + "learning_rate": 1.8006062588610666e-05, + "loss": 0.5647, + "step": 2944 + }, + { + "epoch": 0.23, + "grad_norm": 1.288062800386875, + "learning_rate": 1.8004556766491573e-05, + "loss": 0.6258, + "step": 2945 + }, + { + "epoch": 0.23, + "grad_norm": 1.1649247630632729, + "learning_rate": 1.800305043900351e-05, + "loss": 0.5184, + "step": 2946 + }, + { + "epoch": 0.23, + "grad_norm": 1.1514814868520415, + "learning_rate": 1.8001543606241583e-05, + "loss": 0.6081, + "step": 2947 + }, + { + "epoch": 0.23, + "grad_norm": 1.1776919127949836, + "learning_rate": 1.8000036268300927e-05, + "loss": 0.5703, + "step": 2948 + }, + { + "epoch": 0.23, + "grad_norm": 1.3064339193428178, + "learning_rate": 1.7998528425276703e-05, + "loss": 0.6157, + "step": 2949 + }, + { + "epoch": 0.23, + "grad_norm": 1.2702490083374884, + "learning_rate": 1.7997020077264118e-05, + "loss": 0.6398, + "step": 2950 + }, + { + "epoch": 0.23, + "grad_norm": 1.2168450261809818, + "learning_rate": 1.799551122435839e-05, + "loss": 0.6088, + "step": 2951 + }, + { + "epoch": 0.23, + "grad_norm": 1.150140032328074, + "learning_rate": 1.7994001866654794e-05, + "loss": 0.5849, + "step": 2952 + }, + { + "epoch": 0.23, + "grad_norm": 1.0685143506555084, + "learning_rate": 1.7992492004248615e-05, + "loss": 0.5281, + "step": 2953 + }, + { + "epoch": 0.23, + "grad_norm": 1.2763959068563853, + "learning_rate": 1.7990981637235174e-05, + "loss": 0.607, + "step": 2954 + }, + { + "epoch": 0.23, + "grad_norm": 1.1506804277702987, + "learning_rate": 1.798947076570984e-05, + "loss": 0.5811, + "step": 2955 + }, + { + "epoch": 0.23, + "grad_norm": 1.1526404564556179, + "learning_rate": 1.7987959389767993e-05, + "loss": 0.6009, + "step": 2956 + }, + { + "epoch": 0.23, + "grad_norm": 1.2005724892360312, + "learning_rate": 1.798644750950506e-05, + "loss": 0.6374, + "step": 2957 + }, + { + "epoch": 0.23, + "grad_norm": 1.14599489749152, + "learning_rate": 1.798493512501649e-05, + "loss": 0.5896, + "step": 2958 + }, + { + "epoch": 0.23, + "grad_norm": 1.1765210431864042, + "learning_rate": 1.798342223639777e-05, + "loss": 0.6396, + "step": 2959 + }, + { + "epoch": 0.23, + "grad_norm": 1.2504256954119262, + "learning_rate": 1.7981908843744413e-05, + "loss": 0.574, + "step": 2960 + }, + { + "epoch": 0.23, + "grad_norm": 1.2331037614852411, + "learning_rate": 1.7980394947151972e-05, + "loss": 0.5922, + "step": 2961 + }, + { + "epoch": 0.23, + "grad_norm": 1.329365364258588, + "learning_rate": 1.797888054671602e-05, + "loss": 0.6719, + "step": 2962 + }, + { + "epoch": 0.23, + "grad_norm": 1.2572771437637662, + "learning_rate": 1.7977365642532176e-05, + "loss": 0.6657, + "step": 2963 + }, + { + "epoch": 0.23, + "grad_norm": 1.2879552999079014, + "learning_rate": 1.7975850234696084e-05, + "loss": 0.6882, + "step": 2964 + }, + { + "epoch": 0.23, + "grad_norm": 1.2407461477170283, + "learning_rate": 1.7974334323303414e-05, + "loss": 0.6057, + "step": 2965 + }, + { + "epoch": 0.23, + "grad_norm": 1.2215059859776771, + "learning_rate": 1.7972817908449875e-05, + "loss": 0.6278, + "step": 2966 + }, + { + "epoch": 0.23, + "grad_norm": 1.167622696761847, + "learning_rate": 1.7971300990231208e-05, + "loss": 0.6075, + "step": 2967 + }, + { + "epoch": 0.23, + "grad_norm": 1.1778253169082262, + "learning_rate": 1.7969783568743185e-05, + "loss": 0.6018, + "step": 2968 + }, + { + "epoch": 0.23, + "grad_norm": 1.2269449305997007, + "learning_rate": 1.7968265644081603e-05, + "loss": 0.5796, + "step": 2969 + }, + { + "epoch": 0.23, + "grad_norm": 1.31491470734766, + "learning_rate": 1.7966747216342305e-05, + "loss": 0.6621, + "step": 2970 + }, + { + "epoch": 0.23, + "grad_norm": 1.266386627101717, + "learning_rate": 1.7965228285621148e-05, + "loss": 0.6113, + "step": 2971 + }, + { + "epoch": 0.23, + "grad_norm": 1.1610754719171927, + "learning_rate": 1.7963708852014034e-05, + "loss": 0.5741, + "step": 2972 + }, + { + "epoch": 0.23, + "grad_norm": 1.2845976290844119, + "learning_rate": 1.7962188915616896e-05, + "loss": 0.6657, + "step": 2973 + }, + { + "epoch": 0.23, + "grad_norm": 1.130304124467323, + "learning_rate": 1.7960668476525688e-05, + "loss": 0.6081, + "step": 2974 + }, + { + "epoch": 0.23, + "grad_norm": 1.2494207470577738, + "learning_rate": 1.795914753483641e-05, + "loss": 0.5918, + "step": 2975 + }, + { + "epoch": 0.23, + "grad_norm": 1.2301678486406489, + "learning_rate": 1.7957626090645085e-05, + "loss": 0.619, + "step": 2976 + }, + { + "epoch": 0.23, + "grad_norm": 1.1322886867302662, + "learning_rate": 1.7956104144047768e-05, + "loss": 0.5837, + "step": 2977 + }, + { + "epoch": 0.23, + "grad_norm": 1.1486553063449865, + "learning_rate": 1.795458169514055e-05, + "loss": 0.5828, + "step": 2978 + }, + { + "epoch": 0.23, + "grad_norm": 1.160789137302683, + "learning_rate": 1.7953058744019546e-05, + "loss": 0.5474, + "step": 2979 + }, + { + "epoch": 0.23, + "grad_norm": 1.2342475632181233, + "learning_rate": 1.7951535290780913e-05, + "loss": 0.5618, + "step": 2980 + }, + { + "epoch": 0.23, + "grad_norm": 1.270766376336555, + "learning_rate": 1.7950011335520832e-05, + "loss": 0.5931, + "step": 2981 + }, + { + "epoch": 0.23, + "grad_norm": 1.2507334941298718, + "learning_rate": 1.7948486878335522e-05, + "loss": 0.5865, + "step": 2982 + }, + { + "epoch": 0.23, + "grad_norm": 1.5868925779795653, + "learning_rate": 1.7946961919321225e-05, + "loss": 0.6711, + "step": 2983 + }, + { + "epoch": 0.23, + "grad_norm": 1.1474935064360614, + "learning_rate": 1.7945436458574216e-05, + "loss": 0.5709, + "step": 2984 + }, + { + "epoch": 0.23, + "grad_norm": 1.1264940513524888, + "learning_rate": 1.7943910496190816e-05, + "loss": 0.584, + "step": 2985 + }, + { + "epoch": 0.23, + "grad_norm": 1.2317083986789328, + "learning_rate": 1.794238403226736e-05, + "loss": 0.6154, + "step": 2986 + }, + { + "epoch": 0.23, + "grad_norm": 1.1544663874415577, + "learning_rate": 1.7940857066900223e-05, + "loss": 0.5924, + "step": 2987 + }, + { + "epoch": 0.23, + "grad_norm": 1.277005445377626, + "learning_rate": 1.7939329600185807e-05, + "loss": 0.6846, + "step": 2988 + }, + { + "epoch": 0.23, + "grad_norm": 1.1453012213470386, + "learning_rate": 1.7937801632220556e-05, + "loss": 0.5462, + "step": 2989 + }, + { + "epoch": 0.23, + "grad_norm": 1.2065896025614506, + "learning_rate": 1.793627316310093e-05, + "loss": 0.589, + "step": 2990 + }, + { + "epoch": 0.23, + "grad_norm": 1.1848137236710226, + "learning_rate": 1.7934744192923436e-05, + "loss": 0.6013, + "step": 2991 + }, + { + "epoch": 0.23, + "grad_norm": 1.2619593246270835, + "learning_rate": 1.7933214721784602e-05, + "loss": 0.6333, + "step": 2992 + }, + { + "epoch": 0.23, + "grad_norm": 1.2150648596606142, + "learning_rate": 1.7931684749780994e-05, + "loss": 0.6105, + "step": 2993 + }, + { + "epoch": 0.23, + "grad_norm": 1.2555330840400099, + "learning_rate": 1.7930154277009207e-05, + "loss": 0.654, + "step": 2994 + }, + { + "epoch": 0.23, + "grad_norm": 1.1743399064533986, + "learning_rate": 1.792862330356586e-05, + "loss": 0.6053, + "step": 2995 + }, + { + "epoch": 0.23, + "grad_norm": 1.164000387902867, + "learning_rate": 1.7927091829547624e-05, + "loss": 0.541, + "step": 2996 + }, + { + "epoch": 0.23, + "grad_norm": 1.2705627476443653, + "learning_rate": 1.792555985505118e-05, + "loss": 0.6016, + "step": 2997 + }, + { + "epoch": 0.23, + "grad_norm": 1.2389304688427403, + "learning_rate": 1.7924027380173253e-05, + "loss": 0.5954, + "step": 2998 + }, + { + "epoch": 0.23, + "grad_norm": 1.1266852577827493, + "learning_rate": 1.7922494405010593e-05, + "loss": 0.691, + "step": 2999 + }, + { + "epoch": 0.23, + "grad_norm": 1.2202408304305663, + "learning_rate": 1.792096092965999e-05, + "loss": 0.6233, + "step": 3000 + }, + { + "epoch": 0.23, + "grad_norm": 1.2500432960641452, + "learning_rate": 1.7919426954218252e-05, + "loss": 0.6002, + "step": 3001 + }, + { + "epoch": 0.23, + "grad_norm": 1.2753983828905995, + "learning_rate": 1.7917892478782234e-05, + "loss": 0.6494, + "step": 3002 + }, + { + "epoch": 0.23, + "grad_norm": 1.1316247962000014, + "learning_rate": 1.791635750344881e-05, + "loss": 0.5839, + "step": 3003 + }, + { + "epoch": 0.23, + "grad_norm": 1.3513382742223259, + "learning_rate": 1.79148220283149e-05, + "loss": 0.6534, + "step": 3004 + }, + { + "epoch": 0.23, + "grad_norm": 1.3354423006704952, + "learning_rate": 1.7913286053477434e-05, + "loss": 0.689, + "step": 3005 + }, + { + "epoch": 0.23, + "grad_norm": 1.293549790301838, + "learning_rate": 1.7911749579033394e-05, + "loss": 0.7231, + "step": 3006 + }, + { + "epoch": 0.23, + "grad_norm": 1.1181166193196503, + "learning_rate": 1.7910212605079788e-05, + "loss": 0.5189, + "step": 3007 + }, + { + "epoch": 0.23, + "grad_norm": 1.2468709883469986, + "learning_rate": 1.7908675131713642e-05, + "loss": 0.5978, + "step": 3008 + }, + { + "epoch": 0.23, + "grad_norm": 1.1993187480574705, + "learning_rate": 1.7907137159032036e-05, + "loss": 0.6734, + "step": 3009 + }, + { + "epoch": 0.23, + "grad_norm": 1.0643164870731645, + "learning_rate": 1.7905598687132064e-05, + "loss": 0.5736, + "step": 3010 + }, + { + "epoch": 0.23, + "grad_norm": 1.2758891371306944, + "learning_rate": 1.790405971611086e-05, + "loss": 0.589, + "step": 3011 + }, + { + "epoch": 0.23, + "grad_norm": 1.2234611421251935, + "learning_rate": 1.790252024606559e-05, + "loss": 0.6108, + "step": 3012 + }, + { + "epoch": 0.23, + "grad_norm": 1.1952096234100136, + "learning_rate": 1.7900980277093438e-05, + "loss": 0.6469, + "step": 3013 + }, + { + "epoch": 0.23, + "grad_norm": 1.3264603448541694, + "learning_rate": 1.7899439809291643e-05, + "loss": 0.5981, + "step": 3014 + }, + { + "epoch": 0.23, + "grad_norm": 1.1788633891126337, + "learning_rate": 1.7897898842757455e-05, + "loss": 0.577, + "step": 3015 + }, + { + "epoch": 0.23, + "grad_norm": 1.1995112655702147, + "learning_rate": 1.7896357377588165e-05, + "loss": 0.5554, + "step": 3016 + }, + { + "epoch": 0.23, + "grad_norm": 1.3007924763879526, + "learning_rate": 1.7894815413881096e-05, + "loss": 0.693, + "step": 3017 + }, + { + "epoch": 0.23, + "grad_norm": 1.31782970950039, + "learning_rate": 1.7893272951733596e-05, + "loss": 0.6413, + "step": 3018 + }, + { + "epoch": 0.23, + "grad_norm": 1.345985061942385, + "learning_rate": 1.789172999124305e-05, + "loss": 0.6494, + "step": 3019 + }, + { + "epoch": 0.23, + "grad_norm": 1.153175855306034, + "learning_rate": 1.7890186532506875e-05, + "loss": 0.5905, + "step": 3020 + }, + { + "epoch": 0.23, + "grad_norm": 1.2546485771327105, + "learning_rate": 1.788864257562252e-05, + "loss": 0.6302, + "step": 3021 + }, + { + "epoch": 0.23, + "grad_norm": 1.2414322962588247, + "learning_rate": 1.788709812068745e-05, + "loss": 0.6255, + "step": 3022 + }, + { + "epoch": 0.23, + "grad_norm": 1.1241496899291201, + "learning_rate": 1.788555316779919e-05, + "loss": 0.6019, + "step": 3023 + }, + { + "epoch": 0.23, + "grad_norm": 1.2547422575254765, + "learning_rate": 1.7884007717055273e-05, + "loss": 0.6121, + "step": 3024 + }, + { + "epoch": 0.23, + "grad_norm": 1.2316432131356856, + "learning_rate": 1.7882461768553275e-05, + "loss": 0.5705, + "step": 3025 + }, + { + "epoch": 0.23, + "grad_norm": 1.209348460455779, + "learning_rate": 1.7880915322390794e-05, + "loss": 0.6658, + "step": 3026 + }, + { + "epoch": 0.23, + "grad_norm": 1.331915484009588, + "learning_rate": 1.787936837866547e-05, + "loss": 0.6434, + "step": 3027 + }, + { + "epoch": 0.23, + "grad_norm": 1.3494130501057997, + "learning_rate": 1.7877820937474966e-05, + "loss": 0.6342, + "step": 3028 + }, + { + "epoch": 0.23, + "grad_norm": 1.2896999285879898, + "learning_rate": 1.7876272998916987e-05, + "loss": 0.6691, + "step": 3029 + }, + { + "epoch": 0.24, + "grad_norm": 1.2085400875830674, + "learning_rate": 1.787472456308925e-05, + "loss": 0.5967, + "step": 3030 + }, + { + "epoch": 0.24, + "grad_norm": 1.1905624653571254, + "learning_rate": 1.787317563008953e-05, + "loss": 0.6734, + "step": 3031 + }, + { + "epoch": 0.24, + "grad_norm": 1.2800279697700183, + "learning_rate": 1.7871626200015607e-05, + "loss": 0.7052, + "step": 3032 + }, + { + "epoch": 0.24, + "grad_norm": 1.1663746752237185, + "learning_rate": 1.7870076272965313e-05, + "loss": 0.5192, + "step": 3033 + }, + { + "epoch": 0.24, + "grad_norm": 1.2075594638217475, + "learning_rate": 1.78685258490365e-05, + "loss": 0.5684, + "step": 3034 + }, + { + "epoch": 0.24, + "grad_norm": 1.1376299794310991, + "learning_rate": 1.7866974928327052e-05, + "loss": 0.5841, + "step": 3035 + }, + { + "epoch": 0.24, + "grad_norm": 1.2071017149075514, + "learning_rate": 1.7865423510934888e-05, + "loss": 0.5715, + "step": 3036 + }, + { + "epoch": 0.24, + "grad_norm": 1.0675487616165151, + "learning_rate": 1.7863871596957963e-05, + "loss": 0.5698, + "step": 3037 + }, + { + "epoch": 0.24, + "grad_norm": 1.3310970039386116, + "learning_rate": 1.7862319186494245e-05, + "loss": 0.6288, + "step": 3038 + }, + { + "epoch": 0.24, + "grad_norm": 1.137937174616564, + "learning_rate": 1.7860766279641758e-05, + "loss": 0.6085, + "step": 3039 + }, + { + "epoch": 0.24, + "grad_norm": 1.1180286575356846, + "learning_rate": 1.7859212876498536e-05, + "loss": 0.5583, + "step": 3040 + }, + { + "epoch": 0.24, + "grad_norm": 1.184991143833337, + "learning_rate": 1.7857658977162655e-05, + "loss": 0.5962, + "step": 3041 + }, + { + "epoch": 0.24, + "grad_norm": 1.1759104610176012, + "learning_rate": 1.7856104581732227e-05, + "loss": 0.6277, + "step": 3042 + }, + { + "epoch": 0.24, + "grad_norm": 1.216858006590385, + "learning_rate": 1.785454969030538e-05, + "loss": 0.6217, + "step": 3043 + }, + { + "epoch": 0.24, + "grad_norm": 1.218961110537572, + "learning_rate": 1.785299430298029e-05, + "loss": 0.5992, + "step": 3044 + }, + { + "epoch": 0.24, + "grad_norm": 1.1718555703141884, + "learning_rate": 1.7851438419855157e-05, + "loss": 0.5835, + "step": 3045 + }, + { + "epoch": 0.24, + "grad_norm": 1.0695880749405033, + "learning_rate": 1.7849882041028203e-05, + "loss": 0.539, + "step": 3046 + }, + { + "epoch": 0.24, + "grad_norm": 1.0857060137680985, + "learning_rate": 1.7848325166597698e-05, + "loss": 0.555, + "step": 3047 + }, + { + "epoch": 0.24, + "grad_norm": 1.255971900561838, + "learning_rate": 1.7846767796661934e-05, + "loss": 0.6104, + "step": 3048 + }, + { + "epoch": 0.24, + "grad_norm": 1.21615372954044, + "learning_rate": 1.7845209931319232e-05, + "loss": 0.657, + "step": 3049 + }, + { + "epoch": 0.24, + "grad_norm": 1.1972051857259582, + "learning_rate": 1.7843651570667953e-05, + "loss": 0.5963, + "step": 3050 + }, + { + "epoch": 0.24, + "grad_norm": 1.1667782128460005, + "learning_rate": 1.784209271480648e-05, + "loss": 0.6278, + "step": 3051 + }, + { + "epoch": 0.24, + "grad_norm": 1.2684358536737081, + "learning_rate": 1.7840533363833238e-05, + "loss": 0.5818, + "step": 3052 + }, + { + "epoch": 0.24, + "grad_norm": 1.3054852359627946, + "learning_rate": 1.783897351784667e-05, + "loss": 0.6701, + "step": 3053 + }, + { + "epoch": 0.24, + "grad_norm": 1.2808090474629414, + "learning_rate": 1.7837413176945263e-05, + "loss": 0.5995, + "step": 3054 + }, + { + "epoch": 0.24, + "grad_norm": 1.26955472484306, + "learning_rate": 1.7835852341227523e-05, + "loss": 0.6308, + "step": 3055 + }, + { + "epoch": 0.24, + "grad_norm": 1.3128604848001724, + "learning_rate": 1.7834291010791998e-05, + "loss": 0.6654, + "step": 3056 + }, + { + "epoch": 0.24, + "grad_norm": 1.185201025703545, + "learning_rate": 1.783272918573726e-05, + "loss": 0.6088, + "step": 3057 + }, + { + "epoch": 0.24, + "grad_norm": 1.1497198198923226, + "learning_rate": 1.7831166866161923e-05, + "loss": 0.625, + "step": 3058 + }, + { + "epoch": 0.24, + "grad_norm": 1.135485055012956, + "learning_rate": 1.7829604052164616e-05, + "loss": 0.5678, + "step": 3059 + }, + { + "epoch": 0.24, + "grad_norm": 1.0077649482331934, + "learning_rate": 1.7828040743844008e-05, + "loss": 0.5576, + "step": 3060 + }, + { + "epoch": 0.24, + "grad_norm": 1.1692780265785083, + "learning_rate": 1.7826476941298797e-05, + "loss": 0.5841, + "step": 3061 + }, + { + "epoch": 0.24, + "grad_norm": 1.2531621513662476, + "learning_rate": 1.782491264462772e-05, + "loss": 0.5952, + "step": 3062 + }, + { + "epoch": 0.24, + "grad_norm": 1.1774960809792259, + "learning_rate": 1.782334785392954e-05, + "loss": 0.6002, + "step": 3063 + }, + { + "epoch": 0.24, + "grad_norm": 1.3198974227785316, + "learning_rate": 1.7821782569303045e-05, + "loss": 0.6301, + "step": 3064 + }, + { + "epoch": 0.24, + "grad_norm": 1.2412421509954603, + "learning_rate": 1.782021679084706e-05, + "loss": 0.6383, + "step": 3065 + }, + { + "epoch": 0.24, + "grad_norm": 1.2281711016439567, + "learning_rate": 1.781865051866044e-05, + "loss": 0.5641, + "step": 3066 + }, + { + "epoch": 0.24, + "grad_norm": 1.2609974124860723, + "learning_rate": 1.781708375284208e-05, + "loss": 0.6578, + "step": 3067 + }, + { + "epoch": 0.24, + "grad_norm": 1.2377115377091337, + "learning_rate": 1.7815516493490888e-05, + "loss": 0.5483, + "step": 3068 + }, + { + "epoch": 0.24, + "grad_norm": 1.1886722901914275, + "learning_rate": 1.7813948740705816e-05, + "loss": 0.5809, + "step": 3069 + }, + { + "epoch": 0.24, + "grad_norm": 1.3063399763010588, + "learning_rate": 1.7812380494585846e-05, + "loss": 0.6425, + "step": 3070 + }, + { + "epoch": 0.24, + "grad_norm": 1.2197979554172487, + "learning_rate": 1.781081175522999e-05, + "loss": 0.6027, + "step": 3071 + }, + { + "epoch": 0.24, + "grad_norm": 1.3226034840008223, + "learning_rate": 1.780924252273729e-05, + "loss": 0.6882, + "step": 3072 + }, + { + "epoch": 0.24, + "grad_norm": 1.2641265381387161, + "learning_rate": 1.780767279720682e-05, + "loss": 0.593, + "step": 3073 + }, + { + "epoch": 0.24, + "grad_norm": 1.1856570247654552, + "learning_rate": 1.780610257873768e-05, + "loss": 0.5957, + "step": 3074 + }, + { + "epoch": 0.24, + "grad_norm": 1.2788455545590587, + "learning_rate": 1.7804531867429013e-05, + "loss": 0.5838, + "step": 3075 + }, + { + "epoch": 0.24, + "grad_norm": 1.0892644183290483, + "learning_rate": 1.780296066337998e-05, + "loss": 0.5805, + "step": 3076 + }, + { + "epoch": 0.24, + "grad_norm": 1.2363920029564532, + "learning_rate": 1.7801388966689784e-05, + "loss": 0.6094, + "step": 3077 + }, + { + "epoch": 0.24, + "grad_norm": 1.1601366754286944, + "learning_rate": 1.7799816777457653e-05, + "loss": 0.5863, + "step": 3078 + }, + { + "epoch": 0.24, + "grad_norm": 1.1786872210325052, + "learning_rate": 1.7798244095782847e-05, + "loss": 0.5889, + "step": 3079 + }, + { + "epoch": 0.24, + "grad_norm": 1.2907588547575215, + "learning_rate": 1.779667092176466e-05, + "loss": 0.6372, + "step": 3080 + }, + { + "epoch": 0.24, + "grad_norm": 1.2001794700725412, + "learning_rate": 1.779509725550241e-05, + "loss": 0.604, + "step": 3081 + }, + { + "epoch": 0.24, + "grad_norm": 1.2630275398417172, + "learning_rate": 1.7793523097095452e-05, + "loss": 0.5937, + "step": 3082 + }, + { + "epoch": 0.24, + "grad_norm": 1.2784739868954929, + "learning_rate": 1.7791948446643173e-05, + "loss": 0.563, + "step": 3083 + }, + { + "epoch": 0.24, + "grad_norm": 1.1527263975885382, + "learning_rate": 1.7790373304244986e-05, + "loss": 0.5368, + "step": 3084 + }, + { + "epoch": 0.24, + "grad_norm": 1.2456382469012621, + "learning_rate": 1.778879767000034e-05, + "loss": 0.6421, + "step": 3085 + }, + { + "epoch": 0.24, + "grad_norm": 1.2194838637457066, + "learning_rate": 1.7787221544008715e-05, + "loss": 0.5656, + "step": 3086 + }, + { + "epoch": 0.24, + "grad_norm": 1.273948192582068, + "learning_rate": 1.7785644926369616e-05, + "loss": 0.6647, + "step": 3087 + }, + { + "epoch": 0.24, + "grad_norm": 1.2184152387927174, + "learning_rate": 1.778406781718258e-05, + "loss": 0.6336, + "step": 3088 + }, + { + "epoch": 0.24, + "grad_norm": 1.3443164962034504, + "learning_rate": 1.7782490216547182e-05, + "loss": 0.5989, + "step": 3089 + }, + { + "epoch": 0.24, + "grad_norm": 1.2599534005676034, + "learning_rate": 1.778091212456303e-05, + "loss": 0.6655, + "step": 3090 + }, + { + "epoch": 0.24, + "grad_norm": 1.2018116805466454, + "learning_rate": 1.7779333541329745e-05, + "loss": 0.6561, + "step": 3091 + }, + { + "epoch": 0.24, + "grad_norm": 1.184220201748367, + "learning_rate": 1.7777754466947002e-05, + "loss": 0.5243, + "step": 3092 + }, + { + "epoch": 0.24, + "grad_norm": 1.1420561290916702, + "learning_rate": 1.7776174901514493e-05, + "loss": 0.5828, + "step": 3093 + }, + { + "epoch": 0.24, + "grad_norm": 1.1784786068087671, + "learning_rate": 1.777459484513194e-05, + "loss": 0.5923, + "step": 3094 + }, + { + "epoch": 0.24, + "grad_norm": 1.2604503574545003, + "learning_rate": 1.77730142978991e-05, + "loss": 0.5698, + "step": 3095 + }, + { + "epoch": 0.24, + "grad_norm": 1.2245177214910639, + "learning_rate": 1.7771433259915767e-05, + "loss": 0.6243, + "step": 3096 + }, + { + "epoch": 0.24, + "grad_norm": 1.1310737156374882, + "learning_rate": 1.7769851731281758e-05, + "loss": 0.5655, + "step": 3097 + }, + { + "epoch": 0.24, + "grad_norm": 1.2791102149427254, + "learning_rate": 1.7768269712096922e-05, + "loss": 0.5306, + "step": 3098 + }, + { + "epoch": 0.24, + "grad_norm": 1.3072608650297313, + "learning_rate": 1.7766687202461137e-05, + "loss": 0.6488, + "step": 3099 + }, + { + "epoch": 0.24, + "grad_norm": 1.3758335188078605, + "learning_rate": 1.776510420247432e-05, + "loss": 0.7007, + "step": 3100 + }, + { + "epoch": 0.24, + "grad_norm": 1.2715708150940994, + "learning_rate": 1.7763520712236414e-05, + "loss": 0.6378, + "step": 3101 + }, + { + "epoch": 0.24, + "grad_norm": 1.2376673286673203, + "learning_rate": 1.776193673184739e-05, + "loss": 0.6697, + "step": 3102 + }, + { + "epoch": 0.24, + "grad_norm": 1.293950425623742, + "learning_rate": 1.776035226140725e-05, + "loss": 0.6214, + "step": 3103 + }, + { + "epoch": 0.24, + "grad_norm": 1.1915457221770092, + "learning_rate": 1.7758767301016034e-05, + "loss": 0.6003, + "step": 3104 + }, + { + "epoch": 0.24, + "grad_norm": 1.3364921890612806, + "learning_rate": 1.7757181850773812e-05, + "loss": 0.6393, + "step": 3105 + }, + { + "epoch": 0.24, + "grad_norm": 1.243742012113291, + "learning_rate": 1.7755595910780677e-05, + "loss": 0.625, + "step": 3106 + }, + { + "epoch": 0.24, + "grad_norm": 1.1462533643334123, + "learning_rate": 1.7754009481136758e-05, + "loss": 0.5895, + "step": 3107 + }, + { + "epoch": 0.24, + "grad_norm": 1.2663942518848426, + "learning_rate": 1.7752422561942216e-05, + "loss": 0.6783, + "step": 3108 + }, + { + "epoch": 0.24, + "grad_norm": 1.2311750534406798, + "learning_rate": 1.7750835153297238e-05, + "loss": 0.6015, + "step": 3109 + }, + { + "epoch": 0.24, + "grad_norm": 1.1748801291985216, + "learning_rate": 1.774924725530205e-05, + "loss": 0.579, + "step": 3110 + }, + { + "epoch": 0.24, + "grad_norm": 1.225639919596761, + "learning_rate": 1.77476588680569e-05, + "loss": 0.6048, + "step": 3111 + }, + { + "epoch": 0.24, + "grad_norm": 1.0920129879434115, + "learning_rate": 1.7746069991662076e-05, + "loss": 0.588, + "step": 3112 + }, + { + "epoch": 0.24, + "grad_norm": 1.2202342849654233, + "learning_rate": 1.7744480626217886e-05, + "loss": 0.6283, + "step": 3113 + }, + { + "epoch": 0.24, + "grad_norm": 1.2688035963041178, + "learning_rate": 1.774289077182468e-05, + "loss": 0.6084, + "step": 3114 + }, + { + "epoch": 0.24, + "grad_norm": 1.2619185629018865, + "learning_rate": 1.7741300428582827e-05, + "loss": 0.5455, + "step": 3115 + }, + { + "epoch": 0.24, + "grad_norm": 1.106099141066414, + "learning_rate": 1.7739709596592742e-05, + "loss": 0.5378, + "step": 3116 + }, + { + "epoch": 0.24, + "grad_norm": 1.2088591922171792, + "learning_rate": 1.773811827595486e-05, + "loss": 0.5974, + "step": 3117 + }, + { + "epoch": 0.24, + "grad_norm": 1.2637047977079623, + "learning_rate": 1.7736526466769645e-05, + "loss": 0.6625, + "step": 3118 + }, + { + "epoch": 0.24, + "grad_norm": 1.3631647710582162, + "learning_rate": 1.77349341691376e-05, + "loss": 0.6746, + "step": 3119 + }, + { + "epoch": 0.24, + "grad_norm": 1.2091894022151295, + "learning_rate": 1.7733341383159254e-05, + "loss": 0.5602, + "step": 3120 + }, + { + "epoch": 0.24, + "grad_norm": 1.3337131743991497, + "learning_rate": 1.773174810893517e-05, + "loss": 0.6713, + "step": 3121 + }, + { + "epoch": 0.24, + "grad_norm": 1.1644816316111655, + "learning_rate": 1.7730154346565932e-05, + "loss": 0.6071, + "step": 3122 + }, + { + "epoch": 0.24, + "grad_norm": 1.2607845000857336, + "learning_rate": 1.772856009615217e-05, + "loss": 0.6426, + "step": 3123 + }, + { + "epoch": 0.24, + "grad_norm": 1.1358324508682223, + "learning_rate": 1.7726965357794536e-05, + "loss": 0.5585, + "step": 3124 + }, + { + "epoch": 0.24, + "grad_norm": 1.2403745557005192, + "learning_rate": 1.7725370131593713e-05, + "loss": 0.6094, + "step": 3125 + }, + { + "epoch": 0.24, + "grad_norm": 1.174291027532417, + "learning_rate": 1.7723774417650415e-05, + "loss": 0.602, + "step": 3126 + }, + { + "epoch": 0.24, + "grad_norm": 1.2481000288985398, + "learning_rate": 1.772217821606539e-05, + "loss": 0.6371, + "step": 3127 + }, + { + "epoch": 0.24, + "grad_norm": 1.3395270148383536, + "learning_rate": 1.7720581526939412e-05, + "loss": 0.6196, + "step": 3128 + }, + { + "epoch": 0.24, + "grad_norm": 1.2598934138341693, + "learning_rate": 1.771898435037329e-05, + "loss": 0.5804, + "step": 3129 + }, + { + "epoch": 0.24, + "grad_norm": 1.3108867539960758, + "learning_rate": 1.7717386686467856e-05, + "loss": 0.5992, + "step": 3130 + }, + { + "epoch": 0.24, + "grad_norm": 1.355990526543217, + "learning_rate": 1.771578853532399e-05, + "loss": 0.6602, + "step": 3131 + }, + { + "epoch": 0.24, + "grad_norm": 1.2479598562741128, + "learning_rate": 1.7714189897042583e-05, + "loss": 0.6633, + "step": 3132 + }, + { + "epoch": 0.24, + "grad_norm": 1.3442980070248483, + "learning_rate": 1.7712590771724572e-05, + "loss": 0.6271, + "step": 3133 + }, + { + "epoch": 0.24, + "grad_norm": 1.2984706300624225, + "learning_rate": 1.771099115947091e-05, + "loss": 0.6151, + "step": 3134 + }, + { + "epoch": 0.24, + "grad_norm": 1.2461555967857512, + "learning_rate": 1.7709391060382592e-05, + "loss": 0.6417, + "step": 3135 + }, + { + "epoch": 0.24, + "grad_norm": 1.3102342030781724, + "learning_rate": 1.7707790474560644e-05, + "loss": 0.632, + "step": 3136 + }, + { + "epoch": 0.24, + "grad_norm": 1.1246453361949431, + "learning_rate": 1.7706189402106113e-05, + "loss": 0.5425, + "step": 3137 + }, + { + "epoch": 0.24, + "grad_norm": 1.2233872346064572, + "learning_rate": 1.770458784312009e-05, + "loss": 0.6269, + "step": 3138 + }, + { + "epoch": 0.24, + "grad_norm": 1.2542860460125764, + "learning_rate": 1.7702985797703682e-05, + "loss": 0.6131, + "step": 3139 + }, + { + "epoch": 0.24, + "grad_norm": 1.2888452982702112, + "learning_rate": 1.7701383265958042e-05, + "loss": 0.6223, + "step": 3140 + }, + { + "epoch": 0.24, + "grad_norm": 1.1345702641678044, + "learning_rate": 1.7699780247984343e-05, + "loss": 0.6009, + "step": 3141 + }, + { + "epoch": 0.24, + "grad_norm": 1.23611269937668, + "learning_rate": 1.7698176743883786e-05, + "loss": 0.6078, + "step": 3142 + }, + { + "epoch": 0.24, + "grad_norm": 1.2875427072349082, + "learning_rate": 1.769657275375762e-05, + "loss": 0.681, + "step": 3143 + }, + { + "epoch": 0.24, + "grad_norm": 1.1695176383062844, + "learning_rate": 1.7694968277707102e-05, + "loss": 0.582, + "step": 3144 + }, + { + "epoch": 0.24, + "grad_norm": 1.1835082876240008, + "learning_rate": 1.769336331583354e-05, + "loss": 0.6119, + "step": 3145 + }, + { + "epoch": 0.24, + "grad_norm": 1.199357413063099, + "learning_rate": 1.7691757868238256e-05, + "loss": 0.6433, + "step": 3146 + }, + { + "epoch": 0.24, + "grad_norm": 1.1096665979692646, + "learning_rate": 1.7690151935022616e-05, + "loss": 0.5449, + "step": 3147 + }, + { + "epoch": 0.24, + "grad_norm": 1.2888041844340898, + "learning_rate": 1.7688545516288006e-05, + "loss": 0.6345, + "step": 3148 + }, + { + "epoch": 0.24, + "grad_norm": 1.388267146926027, + "learning_rate": 1.768693861213585e-05, + "loss": 0.6499, + "step": 3149 + }, + { + "epoch": 0.24, + "grad_norm": 1.085520053462167, + "learning_rate": 1.76853312226676e-05, + "loss": 0.55, + "step": 3150 + }, + { + "epoch": 0.24, + "grad_norm": 1.1869019457995977, + "learning_rate": 1.768372334798474e-05, + "loss": 0.59, + "step": 3151 + }, + { + "epoch": 0.24, + "grad_norm": 1.2856650229507738, + "learning_rate": 1.7682114988188782e-05, + "loss": 0.5742, + "step": 3152 + }, + { + "epoch": 0.24, + "grad_norm": 1.292939661165538, + "learning_rate": 1.768050614338127e-05, + "loss": 0.6041, + "step": 3153 + }, + { + "epoch": 0.24, + "grad_norm": 1.0942013490462523, + "learning_rate": 1.767889681366378e-05, + "loss": 0.5807, + "step": 3154 + }, + { + "epoch": 0.24, + "grad_norm": 1.208738434508418, + "learning_rate": 1.7677286999137916e-05, + "loss": 0.5863, + "step": 3155 + }, + { + "epoch": 0.24, + "grad_norm": 1.2480825022118012, + "learning_rate": 1.767567669990531e-05, + "loss": 0.5904, + "step": 3156 + }, + { + "epoch": 0.24, + "grad_norm": 1.2206445786741504, + "learning_rate": 1.767406591606764e-05, + "loss": 0.5646, + "step": 3157 + }, + { + "epoch": 0.24, + "grad_norm": 1.236043791896361, + "learning_rate": 1.767245464772659e-05, + "loss": 0.5646, + "step": 3158 + }, + { + "epoch": 0.25, + "grad_norm": 1.17166435891725, + "learning_rate": 1.7670842894983898e-05, + "loss": 0.6143, + "step": 3159 + }, + { + "epoch": 0.25, + "grad_norm": 1.1660721092664443, + "learning_rate": 1.7669230657941314e-05, + "loss": 0.6108, + "step": 3160 + }, + { + "epoch": 0.25, + "grad_norm": 1.3440181109848686, + "learning_rate": 1.7667617936700634e-05, + "loss": 0.6317, + "step": 3161 + }, + { + "epoch": 0.25, + "grad_norm": 1.2503476136379494, + "learning_rate": 1.766600473136367e-05, + "loss": 0.6245, + "step": 3162 + }, + { + "epoch": 0.25, + "grad_norm": 1.2463578090888154, + "learning_rate": 1.7664391042032277e-05, + "loss": 0.6237, + "step": 3163 + }, + { + "epoch": 0.25, + "grad_norm": 1.3054172964588686, + "learning_rate": 1.766277686880834e-05, + "loss": 0.6322, + "step": 3164 + }, + { + "epoch": 0.25, + "grad_norm": 1.2412940116831916, + "learning_rate": 1.7661162211793757e-05, + "loss": 0.6475, + "step": 3165 + }, + { + "epoch": 0.25, + "grad_norm": 1.227847355721397, + "learning_rate": 1.765954707109048e-05, + "loss": 0.6049, + "step": 3166 + }, + { + "epoch": 0.25, + "grad_norm": 1.2741661093648053, + "learning_rate": 1.7657931446800475e-05, + "loss": 0.6355, + "step": 3167 + }, + { + "epoch": 0.25, + "grad_norm": 1.2351243364239475, + "learning_rate": 1.7656315339025754e-05, + "loss": 0.6611, + "step": 3168 + }, + { + "epoch": 0.25, + "grad_norm": 1.293660650070407, + "learning_rate": 1.765469874786834e-05, + "loss": 0.5834, + "step": 3169 + }, + { + "epoch": 0.25, + "grad_norm": 1.28034629585367, + "learning_rate": 1.7653081673430302e-05, + "loss": 0.6123, + "step": 3170 + }, + { + "epoch": 0.25, + "grad_norm": 1.161525137082055, + "learning_rate": 1.7651464115813738e-05, + "loss": 0.5789, + "step": 3171 + }, + { + "epoch": 0.25, + "grad_norm": 1.3447536446386683, + "learning_rate": 1.764984607512076e-05, + "loss": 0.6256, + "step": 3172 + }, + { + "epoch": 0.25, + "grad_norm": 1.189310450523712, + "learning_rate": 1.7648227551453535e-05, + "loss": 0.5704, + "step": 3173 + }, + { + "epoch": 0.25, + "grad_norm": 1.1945091490013942, + "learning_rate": 1.7646608544914245e-05, + "loss": 0.6375, + "step": 3174 + }, + { + "epoch": 0.25, + "grad_norm": 1.1590937620355648, + "learning_rate": 1.764498905560511e-05, + "loss": 0.6081, + "step": 3175 + }, + { + "epoch": 0.25, + "grad_norm": 1.2789708311708203, + "learning_rate": 1.764336908362837e-05, + "loss": 0.6274, + "step": 3176 + }, + { + "epoch": 0.25, + "grad_norm": 1.224904050767292, + "learning_rate": 1.76417486290863e-05, + "loss": 0.5833, + "step": 3177 + }, + { + "epoch": 0.25, + "grad_norm": 1.2411030768253288, + "learning_rate": 1.764012769208122e-05, + "loss": 0.6002, + "step": 3178 + }, + { + "epoch": 0.25, + "grad_norm": 1.1292730349356888, + "learning_rate": 1.7638506272715458e-05, + "loss": 0.5581, + "step": 3179 + }, + { + "epoch": 0.25, + "grad_norm": 1.231072656087877, + "learning_rate": 1.7636884371091385e-05, + "loss": 0.6251, + "step": 3180 + }, + { + "epoch": 0.25, + "grad_norm": 1.1978900519468991, + "learning_rate": 1.76352619873114e-05, + "loss": 0.6132, + "step": 3181 + }, + { + "epoch": 0.25, + "grad_norm": 1.121095185494916, + "learning_rate": 1.7633639121477935e-05, + "loss": 0.5977, + "step": 3182 + }, + { + "epoch": 0.25, + "grad_norm": 1.1506382104217876, + "learning_rate": 1.7632015773693444e-05, + "loss": 0.6299, + "step": 3183 + }, + { + "epoch": 0.25, + "grad_norm": 1.1663758505802897, + "learning_rate": 1.7630391944060424e-05, + "loss": 0.6218, + "step": 3184 + }, + { + "epoch": 0.25, + "grad_norm": 1.2367705266238813, + "learning_rate": 1.7628767632681393e-05, + "loss": 0.622, + "step": 3185 + }, + { + "epoch": 0.25, + "grad_norm": 1.2750108045232404, + "learning_rate": 1.7627142839658903e-05, + "loss": 0.6443, + "step": 3186 + }, + { + "epoch": 0.25, + "grad_norm": 1.267974838458455, + "learning_rate": 1.7625517565095536e-05, + "loss": 0.6656, + "step": 3187 + }, + { + "epoch": 0.25, + "grad_norm": 1.2187938682290484, + "learning_rate": 1.76238918090939e-05, + "loss": 0.613, + "step": 3188 + }, + { + "epoch": 0.25, + "grad_norm": 1.2599129524134998, + "learning_rate": 1.762226557175664e-05, + "loss": 0.6017, + "step": 3189 + }, + { + "epoch": 0.25, + "grad_norm": 1.1959538453328042, + "learning_rate": 1.762063885318643e-05, + "loss": 0.581, + "step": 3190 + }, + { + "epoch": 0.25, + "grad_norm": 1.2414900063071381, + "learning_rate": 1.7619011653485968e-05, + "loss": 0.5808, + "step": 3191 + }, + { + "epoch": 0.25, + "grad_norm": 1.2167196233973314, + "learning_rate": 1.7617383972758e-05, + "loss": 0.6234, + "step": 3192 + }, + { + "epoch": 0.25, + "grad_norm": 1.1507774441015806, + "learning_rate": 1.761575581110527e-05, + "loss": 0.5854, + "step": 3193 + }, + { + "epoch": 0.25, + "grad_norm": 1.314947842916153, + "learning_rate": 1.7614127168630594e-05, + "loss": 0.6175, + "step": 3194 + }, + { + "epoch": 0.25, + "grad_norm": 1.123485446378992, + "learning_rate": 1.7612498045436778e-05, + "loss": 0.5849, + "step": 3195 + }, + { + "epoch": 0.25, + "grad_norm": 1.1255848212222526, + "learning_rate": 1.761086844162669e-05, + "loss": 0.5876, + "step": 3196 + }, + { + "epoch": 0.25, + "grad_norm": 1.1940461475457604, + "learning_rate": 1.7609238357303208e-05, + "loss": 0.6501, + "step": 3197 + }, + { + "epoch": 0.25, + "grad_norm": 1.233503347784292, + "learning_rate": 1.760760779256925e-05, + "loss": 0.6747, + "step": 3198 + }, + { + "epoch": 0.25, + "grad_norm": 1.3401509369349782, + "learning_rate": 1.7605976747527763e-05, + "loss": 0.6246, + "step": 3199 + }, + { + "epoch": 0.25, + "grad_norm": 1.2139890061843788, + "learning_rate": 1.760434522228172e-05, + "loss": 0.5566, + "step": 3200 + }, + { + "epoch": 0.25, + "grad_norm": 1.2199347190901457, + "learning_rate": 1.7602713216934132e-05, + "loss": 0.6276, + "step": 3201 + }, + { + "epoch": 0.25, + "grad_norm": 1.1978119790593345, + "learning_rate": 1.7601080731588034e-05, + "loss": 0.611, + "step": 3202 + }, + { + "epoch": 0.25, + "grad_norm": 1.2223582041599093, + "learning_rate": 1.7599447766346494e-05, + "loss": 0.6594, + "step": 3203 + }, + { + "epoch": 0.25, + "grad_norm": 1.2071774094861256, + "learning_rate": 1.7597814321312605e-05, + "loss": 0.6109, + "step": 3204 + }, + { + "epoch": 0.25, + "grad_norm": 1.1836781487416035, + "learning_rate": 1.75961803965895e-05, + "loss": 0.5251, + "step": 3205 + }, + { + "epoch": 0.25, + "grad_norm": 1.3086132902138354, + "learning_rate": 1.7594545992280336e-05, + "loss": 0.5832, + "step": 3206 + }, + { + "epoch": 0.25, + "grad_norm": 1.1603592717457671, + "learning_rate": 1.75929111084883e-05, + "loss": 0.6271, + "step": 3207 + }, + { + "epoch": 0.25, + "grad_norm": 1.247102621999723, + "learning_rate": 1.7591275745316613e-05, + "loss": 0.6509, + "step": 3208 + }, + { + "epoch": 0.25, + "grad_norm": 1.1599715885431923, + "learning_rate": 1.7589639902868524e-05, + "loss": 0.5862, + "step": 3209 + }, + { + "epoch": 0.25, + "grad_norm": 1.2619552626834056, + "learning_rate": 1.7588003581247307e-05, + "loss": 0.6285, + "step": 3210 + }, + { + "epoch": 0.25, + "grad_norm": 1.265983719086781, + "learning_rate": 1.7586366780556275e-05, + "loss": 0.6405, + "step": 3211 + }, + { + "epoch": 0.25, + "grad_norm": 1.1950286827475802, + "learning_rate": 1.7584729500898768e-05, + "loss": 0.5574, + "step": 3212 + }, + { + "epoch": 0.25, + "grad_norm": 1.270312447448204, + "learning_rate": 1.7583091742378157e-05, + "loss": 0.6536, + "step": 3213 + }, + { + "epoch": 0.25, + "grad_norm": 1.1762339078671982, + "learning_rate": 1.7581453505097838e-05, + "loss": 0.6238, + "step": 3214 + }, + { + "epoch": 0.25, + "grad_norm": 1.248898402227216, + "learning_rate": 1.7579814789161246e-05, + "loss": 0.6491, + "step": 3215 + }, + { + "epoch": 0.25, + "grad_norm": 1.2109732837927323, + "learning_rate": 1.757817559467184e-05, + "loss": 0.5537, + "step": 3216 + }, + { + "epoch": 0.25, + "grad_norm": 1.1729884135299256, + "learning_rate": 1.757653592173311e-05, + "loss": 0.6663, + "step": 3217 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556395627549655, + "learning_rate": 1.7574895770448578e-05, + "loss": 0.5797, + "step": 3218 + }, + { + "epoch": 0.25, + "grad_norm": 1.195463881536923, + "learning_rate": 1.7573255140921793e-05, + "loss": 0.6031, + "step": 3219 + }, + { + "epoch": 0.25, + "grad_norm": 1.2584785451321943, + "learning_rate": 1.7571614033256338e-05, + "loss": 0.6481, + "step": 3220 + }, + { + "epoch": 0.25, + "grad_norm": 1.1928259223820412, + "learning_rate": 1.7569972447555827e-05, + "loss": 0.5866, + "step": 3221 + }, + { + "epoch": 0.25, + "grad_norm": 1.2776793059584677, + "learning_rate": 1.7568330383923902e-05, + "loss": 0.6228, + "step": 3222 + }, + { + "epoch": 0.25, + "grad_norm": 1.1727822924128048, + "learning_rate": 1.7566687842464224e-05, + "loss": 0.5913, + "step": 3223 + }, + { + "epoch": 0.25, + "grad_norm": 1.2486822812159255, + "learning_rate": 1.756504482328051e-05, + "loss": 0.6199, + "step": 3224 + }, + { + "epoch": 0.25, + "grad_norm": 1.269318689011064, + "learning_rate": 1.7563401326476484e-05, + "loss": 0.5851, + "step": 3225 + }, + { + "epoch": 0.25, + "grad_norm": 1.2689404330600882, + "learning_rate": 1.7561757352155914e-05, + "loss": 0.6424, + "step": 3226 + }, + { + "epoch": 0.25, + "grad_norm": 1.2317719355357146, + "learning_rate": 1.7560112900422583e-05, + "loss": 0.6589, + "step": 3227 + }, + { + "epoch": 0.25, + "grad_norm": 1.3138338305062551, + "learning_rate": 1.7558467971380323e-05, + "loss": 0.5873, + "step": 3228 + }, + { + "epoch": 0.25, + "grad_norm": 1.1362952003679108, + "learning_rate": 1.755682256513298e-05, + "loss": 0.5645, + "step": 3229 + }, + { + "epoch": 0.25, + "grad_norm": 1.1628404898018452, + "learning_rate": 1.755517668178444e-05, + "loss": 0.6042, + "step": 3230 + }, + { + "epoch": 0.25, + "grad_norm": 1.1885953168381038, + "learning_rate": 1.7553530321438625e-05, + "loss": 0.5527, + "step": 3231 + }, + { + "epoch": 0.25, + "grad_norm": 1.2439340273177062, + "learning_rate": 1.755188348419946e-05, + "loss": 0.6827, + "step": 3232 + }, + { + "epoch": 0.25, + "grad_norm": 1.3462039364413905, + "learning_rate": 1.755023617017093e-05, + "loss": 0.6536, + "step": 3233 + }, + { + "epoch": 0.25, + "grad_norm": 1.3549725946420983, + "learning_rate": 1.7548588379457042e-05, + "loss": 0.6188, + "step": 3234 + }, + { + "epoch": 0.25, + "grad_norm": 1.3044959287422724, + "learning_rate": 1.754694011216182e-05, + "loss": 0.6692, + "step": 3235 + }, + { + "epoch": 0.25, + "grad_norm": 1.127275232465399, + "learning_rate": 1.754529136838933e-05, + "loss": 0.5306, + "step": 3236 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800491587049007, + "learning_rate": 1.7543642148243672e-05, + "loss": 0.6129, + "step": 3237 + }, + { + "epoch": 0.25, + "grad_norm": 1.2309093352613774, + "learning_rate": 1.754199245182896e-05, + "loss": 0.623, + "step": 3238 + }, + { + "epoch": 0.25, + "grad_norm": 1.1864438128343366, + "learning_rate": 1.7540342279249355e-05, + "loss": 0.5829, + "step": 3239 + }, + { + "epoch": 0.25, + "grad_norm": 1.1907681116316102, + "learning_rate": 1.753869163060904e-05, + "loss": 0.6444, + "step": 3240 + }, + { + "epoch": 0.25, + "grad_norm": 1.1033147308321871, + "learning_rate": 1.753704050601223e-05, + "loss": 0.6013, + "step": 3241 + }, + { + "epoch": 0.25, + "grad_norm": 1.119433033608672, + "learning_rate": 1.7535388905563163e-05, + "loss": 0.5642, + "step": 3242 + }, + { + "epoch": 0.25, + "grad_norm": 1.1216088683289283, + "learning_rate": 1.7533736829366116e-05, + "loss": 0.5404, + "step": 3243 + }, + { + "epoch": 0.25, + "grad_norm": 1.3375103263812862, + "learning_rate": 1.7532084277525396e-05, + "loss": 0.6429, + "step": 3244 + }, + { + "epoch": 0.25, + "grad_norm": 1.2044718251208995, + "learning_rate": 1.7530431250145335e-05, + "loss": 0.63, + "step": 3245 + }, + { + "epoch": 0.25, + "grad_norm": 1.224135409051966, + "learning_rate": 1.7528777747330296e-05, + "loss": 0.6245, + "step": 3246 + }, + { + "epoch": 0.25, + "grad_norm": 1.1316154205917992, + "learning_rate": 1.7527123769184676e-05, + "loss": 0.5946, + "step": 3247 + }, + { + "epoch": 0.25, + "grad_norm": 1.238204713014442, + "learning_rate": 1.75254693158129e-05, + "loss": 0.5939, + "step": 3248 + }, + { + "epoch": 0.25, + "grad_norm": 1.1776740468327798, + "learning_rate": 1.7523814387319413e-05, + "loss": 0.5823, + "step": 3249 + }, + { + "epoch": 0.25, + "grad_norm": 1.1853791421449613, + "learning_rate": 1.752215898380871e-05, + "loss": 0.5543, + "step": 3250 + }, + { + "epoch": 0.25, + "grad_norm": 1.3081901269439555, + "learning_rate": 1.7520503105385303e-05, + "loss": 0.6349, + "step": 3251 + }, + { + "epoch": 0.25, + "grad_norm": 1.2068558846396564, + "learning_rate": 1.7518846752153732e-05, + "loss": 0.6398, + "step": 3252 + }, + { + "epoch": 0.25, + "grad_norm": 1.1487598615519865, + "learning_rate": 1.7517189924218573e-05, + "loss": 0.5859, + "step": 3253 + }, + { + "epoch": 0.25, + "grad_norm": 1.2408946765128526, + "learning_rate": 1.751553262168443e-05, + "loss": 0.6284, + "step": 3254 + }, + { + "epoch": 0.25, + "grad_norm": 1.252214425323239, + "learning_rate": 1.751387484465594e-05, + "loss": 0.602, + "step": 3255 + }, + { + "epoch": 0.25, + "grad_norm": 1.2074338371205104, + "learning_rate": 1.751221659323776e-05, + "loss": 0.592, + "step": 3256 + }, + { + "epoch": 0.25, + "grad_norm": 1.1190562058276843, + "learning_rate": 1.7510557867534594e-05, + "loss": 0.5727, + "step": 3257 + }, + { + "epoch": 0.25, + "grad_norm": 1.1997168186954494, + "learning_rate": 1.7508898667651158e-05, + "loss": 0.6496, + "step": 3258 + }, + { + "epoch": 0.25, + "grad_norm": 1.188626507944179, + "learning_rate": 1.7507238993692207e-05, + "loss": 0.6299, + "step": 3259 + }, + { + "epoch": 0.25, + "grad_norm": 1.1822751698064238, + "learning_rate": 1.750557884576253e-05, + "loss": 0.6066, + "step": 3260 + }, + { + "epoch": 0.25, + "grad_norm": 1.3028565768414537, + "learning_rate": 1.7503918223966932e-05, + "loss": 0.6039, + "step": 3261 + }, + { + "epoch": 0.25, + "grad_norm": 1.1614333294057304, + "learning_rate": 1.7502257128410265e-05, + "loss": 0.6381, + "step": 3262 + }, + { + "epoch": 0.25, + "grad_norm": 1.1393542069270943, + "learning_rate": 1.75005955591974e-05, + "loss": 0.5498, + "step": 3263 + }, + { + "epoch": 0.25, + "grad_norm": 1.231385340731515, + "learning_rate": 1.7498933516433233e-05, + "loss": 0.6238, + "step": 3264 + }, + { + "epoch": 0.25, + "grad_norm": 1.1913341062621896, + "learning_rate": 1.749727100022271e-05, + "loss": 0.6228, + "step": 3265 + }, + { + "epoch": 0.25, + "grad_norm": 1.2985829057310687, + "learning_rate": 1.7495608010670783e-05, + "loss": 0.684, + "step": 3266 + }, + { + "epoch": 0.25, + "grad_norm": 1.1821546713263802, + "learning_rate": 1.7493944547882454e-05, + "loss": 0.5988, + "step": 3267 + }, + { + "epoch": 0.25, + "grad_norm": 1.2065675208859925, + "learning_rate": 1.749228061196274e-05, + "loss": 0.5963, + "step": 3268 + }, + { + "epoch": 0.25, + "grad_norm": 1.1694908303288254, + "learning_rate": 1.7490616203016696e-05, + "loss": 0.6042, + "step": 3269 + }, + { + "epoch": 0.25, + "grad_norm": 1.201746807671075, + "learning_rate": 1.7488951321149405e-05, + "loss": 0.6107, + "step": 3270 + }, + { + "epoch": 0.25, + "grad_norm": 1.2412208298619856, + "learning_rate": 1.7487285966465982e-05, + "loss": 0.6231, + "step": 3271 + }, + { + "epoch": 0.25, + "grad_norm": 1.1691350822686961, + "learning_rate": 1.7485620139071564e-05, + "loss": 0.5284, + "step": 3272 + }, + { + "epoch": 0.25, + "grad_norm": 1.1960277536674158, + "learning_rate": 1.7483953839071324e-05, + "loss": 0.614, + "step": 3273 + }, + { + "epoch": 0.25, + "grad_norm": 1.251885184647968, + "learning_rate": 1.748228706657047e-05, + "loss": 0.6663, + "step": 3274 + }, + { + "epoch": 0.25, + "grad_norm": 1.1440939735182525, + "learning_rate": 1.7480619821674226e-05, + "loss": 0.5752, + "step": 3275 + }, + { + "epoch": 0.25, + "grad_norm": 1.1080195783544828, + "learning_rate": 1.747895210448786e-05, + "loss": 0.5984, + "step": 3276 + }, + { + "epoch": 0.25, + "grad_norm": 1.2818116143456204, + "learning_rate": 1.747728391511666e-05, + "loss": 0.6211, + "step": 3277 + }, + { + "epoch": 0.25, + "grad_norm": 1.1357567245851952, + "learning_rate": 1.747561525366595e-05, + "loss": 0.5827, + "step": 3278 + }, + { + "epoch": 0.25, + "grad_norm": 1.277899245118622, + "learning_rate": 1.747394612024108e-05, + "loss": 0.6351, + "step": 3279 + }, + { + "epoch": 0.25, + "grad_norm": 1.2762655204554059, + "learning_rate": 1.747227651494743e-05, + "loss": 0.618, + "step": 3280 + }, + { + "epoch": 0.25, + "grad_norm": 1.2271707083784782, + "learning_rate": 1.747060643789041e-05, + "loss": 0.5917, + "step": 3281 + }, + { + "epoch": 0.25, + "grad_norm": 1.242839809901325, + "learning_rate": 1.7468935889175466e-05, + "loss": 0.5971, + "step": 3282 + }, + { + "epoch": 0.25, + "grad_norm": 1.2811186188118213, + "learning_rate": 1.7467264868908064e-05, + "loss": 0.6199, + "step": 3283 + }, + { + "epoch": 0.25, + "grad_norm": 1.1961908541480428, + "learning_rate": 1.7465593377193704e-05, + "loss": 0.6229, + "step": 3284 + }, + { + "epoch": 0.25, + "grad_norm": 1.355854518203587, + "learning_rate": 1.7463921414137916e-05, + "loss": 0.6819, + "step": 3285 + }, + { + "epoch": 0.25, + "grad_norm": 1.2883341263097878, + "learning_rate": 1.746224897984626e-05, + "loss": 0.5962, + "step": 3286 + }, + { + "epoch": 0.25, + "grad_norm": 1.2496667894658595, + "learning_rate": 1.7460576074424327e-05, + "loss": 0.6196, + "step": 3287 + }, + { + "epoch": 0.26, + "grad_norm": 1.3655899741702564, + "learning_rate": 1.745890269797774e-05, + "loss": 0.6604, + "step": 3288 + }, + { + "epoch": 0.26, + "grad_norm": 1.0832631504115473, + "learning_rate": 1.7457228850612132e-05, + "loss": 0.6096, + "step": 3289 + }, + { + "epoch": 0.26, + "grad_norm": 1.1328132366309567, + "learning_rate": 1.7455554532433198e-05, + "loss": 0.6114, + "step": 3290 + }, + { + "epoch": 0.26, + "grad_norm": 1.136917568791836, + "learning_rate": 1.745387974354664e-05, + "loss": 0.5363, + "step": 3291 + }, + { + "epoch": 0.26, + "grad_norm": 1.2180852789002845, + "learning_rate": 1.74522044840582e-05, + "loss": 0.5695, + "step": 3292 + }, + { + "epoch": 0.26, + "grad_norm": 1.145202179678769, + "learning_rate": 1.7450528754073638e-05, + "loss": 0.5848, + "step": 3293 + }, + { + "epoch": 0.26, + "grad_norm": 1.2232268832993016, + "learning_rate": 1.744885255369876e-05, + "loss": 0.5984, + "step": 3294 + }, + { + "epoch": 0.26, + "grad_norm": 1.1450380108283569, + "learning_rate": 1.7447175883039386e-05, + "loss": 0.6363, + "step": 3295 + }, + { + "epoch": 0.26, + "grad_norm": 1.3036661318962648, + "learning_rate": 1.744549874220138e-05, + "loss": 0.6107, + "step": 3296 + }, + { + "epoch": 0.26, + "grad_norm": 1.2884760129336736, + "learning_rate": 1.744382113129062e-05, + "loss": 0.6404, + "step": 3297 + }, + { + "epoch": 0.26, + "grad_norm": 1.2740058800637986, + "learning_rate": 1.744214305041303e-05, + "loss": 0.6113, + "step": 3298 + }, + { + "epoch": 0.26, + "grad_norm": 1.1967480086642135, + "learning_rate": 1.7440464499674553e-05, + "loss": 0.6188, + "step": 3299 + }, + { + "epoch": 0.26, + "grad_norm": 1.3453225980550259, + "learning_rate": 1.7438785479181164e-05, + "loss": 0.6536, + "step": 3300 + }, + { + "epoch": 0.26, + "grad_norm": 1.2878115045682026, + "learning_rate": 1.7437105989038868e-05, + "loss": 0.6355, + "step": 3301 + }, + { + "epoch": 0.26, + "grad_norm": 1.171885833690157, + "learning_rate": 1.74354260293537e-05, + "loss": 0.5886, + "step": 3302 + }, + { + "epoch": 0.26, + "grad_norm": 1.4211772841721313, + "learning_rate": 1.7433745600231726e-05, + "loss": 0.6045, + "step": 3303 + }, + { + "epoch": 0.26, + "grad_norm": 1.3650233551794055, + "learning_rate": 1.743206470177904e-05, + "loss": 0.6654, + "step": 3304 + }, + { + "epoch": 0.26, + "grad_norm": 1.1674912921654297, + "learning_rate": 1.743038333410176e-05, + "loss": 0.6003, + "step": 3305 + }, + { + "epoch": 0.26, + "grad_norm": 1.2374112415272294, + "learning_rate": 1.7428701497306048e-05, + "loss": 0.6522, + "step": 3306 + }, + { + "epoch": 0.26, + "grad_norm": 1.2440135657377303, + "learning_rate": 1.7427019191498086e-05, + "loss": 0.576, + "step": 3307 + }, + { + "epoch": 0.26, + "grad_norm": 1.1372783109504145, + "learning_rate": 1.7425336416784082e-05, + "loss": 0.621, + "step": 3308 + }, + { + "epoch": 0.26, + "grad_norm": 1.322163114419776, + "learning_rate": 1.7423653173270278e-05, + "loss": 0.6274, + "step": 3309 + }, + { + "epoch": 0.26, + "grad_norm": 1.201498393517673, + "learning_rate": 1.742196946106295e-05, + "loss": 0.5946, + "step": 3310 + }, + { + "epoch": 0.26, + "grad_norm": 1.160962835885136, + "learning_rate": 1.74202852802684e-05, + "loss": 0.594, + "step": 3311 + }, + { + "epoch": 0.26, + "grad_norm": 1.2198264430056878, + "learning_rate": 1.741860063099295e-05, + "loss": 0.5602, + "step": 3312 + }, + { + "epoch": 0.26, + "grad_norm": 1.215235116603408, + "learning_rate": 1.7416915513342973e-05, + "loss": 0.5971, + "step": 3313 + }, + { + "epoch": 0.26, + "grad_norm": 1.2221801123204383, + "learning_rate": 1.7415229927424853e-05, + "loss": 0.6551, + "step": 3314 + }, + { + "epoch": 0.26, + "grad_norm": 1.10844995320124, + "learning_rate": 1.741354387334501e-05, + "loss": 0.5566, + "step": 3315 + }, + { + "epoch": 0.26, + "grad_norm": 1.2094471281558543, + "learning_rate": 1.741185735120989e-05, + "loss": 0.6702, + "step": 3316 + }, + { + "epoch": 0.26, + "grad_norm": 1.2872892725812624, + "learning_rate": 1.7410170361125978e-05, + "loss": 0.6408, + "step": 3317 + }, + { + "epoch": 0.26, + "grad_norm": 1.125605685054057, + "learning_rate": 1.740848290319978e-05, + "loss": 0.5682, + "step": 3318 + }, + { + "epoch": 0.26, + "grad_norm": 1.3357768268893488, + "learning_rate": 1.7406794977537832e-05, + "loss": 0.6534, + "step": 3319 + }, + { + "epoch": 0.26, + "grad_norm": 1.2486897754370647, + "learning_rate": 1.7405106584246705e-05, + "loss": 0.6187, + "step": 3320 + }, + { + "epoch": 0.26, + "grad_norm": 1.1170026186038942, + "learning_rate": 1.740341772343299e-05, + "loss": 0.5665, + "step": 3321 + }, + { + "epoch": 0.26, + "grad_norm": 1.1826839664420352, + "learning_rate": 1.7401728395203323e-05, + "loss": 0.5946, + "step": 3322 + }, + { + "epoch": 0.26, + "grad_norm": 1.081077954490754, + "learning_rate": 1.7400038599664354e-05, + "loss": 0.5587, + "step": 3323 + }, + { + "epoch": 0.26, + "grad_norm": 1.3094195137560254, + "learning_rate": 1.7398348336922764e-05, + "loss": 0.6296, + "step": 3324 + }, + { + "epoch": 0.26, + "grad_norm": 1.2678910678799522, + "learning_rate": 1.7396657607085276e-05, + "loss": 0.6614, + "step": 3325 + }, + { + "epoch": 0.26, + "grad_norm": 1.2309330624308885, + "learning_rate": 1.739496641025863e-05, + "loss": 0.6511, + "step": 3326 + }, + { + "epoch": 0.26, + "grad_norm": 1.1810842670252255, + "learning_rate": 1.7393274746549605e-05, + "loss": 0.6624, + "step": 3327 + }, + { + "epoch": 0.26, + "grad_norm": 1.0927437649568406, + "learning_rate": 1.7391582616064998e-05, + "loss": 0.5657, + "step": 3328 + }, + { + "epoch": 0.26, + "grad_norm": 1.1824833158642718, + "learning_rate": 1.7389890018911647e-05, + "loss": 0.6251, + "step": 3329 + }, + { + "epoch": 0.26, + "grad_norm": 1.2882377066578126, + "learning_rate": 1.738819695519641e-05, + "loss": 0.6431, + "step": 3330 + }, + { + "epoch": 0.26, + "grad_norm": 1.274624464417439, + "learning_rate": 1.7386503425026183e-05, + "loss": 0.6202, + "step": 3331 + }, + { + "epoch": 0.26, + "grad_norm": 1.204040278277466, + "learning_rate": 1.7384809428507884e-05, + "loss": 0.5557, + "step": 3332 + }, + { + "epoch": 0.26, + "grad_norm": 1.197070854237926, + "learning_rate": 1.7383114965748465e-05, + "loss": 0.6166, + "step": 3333 + }, + { + "epoch": 0.26, + "grad_norm": 1.3553243774569124, + "learning_rate": 1.738142003685491e-05, + "loss": 0.7191, + "step": 3334 + }, + { + "epoch": 0.26, + "grad_norm": 1.1880538552308022, + "learning_rate": 1.737972464193422e-05, + "loss": 0.5912, + "step": 3335 + }, + { + "epoch": 0.26, + "grad_norm": 1.2519953061524662, + "learning_rate": 1.7378028781093443e-05, + "loss": 0.6046, + "step": 3336 + }, + { + "epoch": 0.26, + "grad_norm": 1.2172224301772756, + "learning_rate": 1.7376332454439643e-05, + "loss": 0.555, + "step": 3337 + }, + { + "epoch": 0.26, + "grad_norm": 1.2924892769669403, + "learning_rate": 1.7374635662079915e-05, + "loss": 0.6675, + "step": 3338 + }, + { + "epoch": 0.26, + "grad_norm": 1.3047546095349118, + "learning_rate": 1.7372938404121393e-05, + "loss": 0.6306, + "step": 3339 + }, + { + "epoch": 0.26, + "grad_norm": 1.3009045058498545, + "learning_rate": 1.7371240680671228e-05, + "loss": 0.6191, + "step": 3340 + }, + { + "epoch": 0.26, + "grad_norm": 1.2138590854610696, + "learning_rate": 1.7369542491836608e-05, + "loss": 0.5735, + "step": 3341 + }, + { + "epoch": 0.26, + "grad_norm": 1.2044767242374326, + "learning_rate": 1.736784383772475e-05, + "loss": 0.6026, + "step": 3342 + }, + { + "epoch": 0.26, + "grad_norm": 1.1745416944596863, + "learning_rate": 1.7366144718442893e-05, + "loss": 0.6085, + "step": 3343 + }, + { + "epoch": 0.26, + "grad_norm": 1.2627075855576206, + "learning_rate": 1.736444513409832e-05, + "loss": 0.6771, + "step": 3344 + }, + { + "epoch": 0.26, + "grad_norm": 1.1491306833927326, + "learning_rate": 1.736274508479833e-05, + "loss": 0.5387, + "step": 3345 + }, + { + "epoch": 0.26, + "grad_norm": 1.1292567253562806, + "learning_rate": 1.7361044570650256e-05, + "loss": 0.557, + "step": 3346 + }, + { + "epoch": 0.26, + "grad_norm": 1.3335490151641733, + "learning_rate": 1.735934359176146e-05, + "loss": 0.5979, + "step": 3347 + }, + { + "epoch": 0.26, + "grad_norm": 1.1763961552817495, + "learning_rate": 1.7357642148239334e-05, + "loss": 0.5747, + "step": 3348 + }, + { + "epoch": 0.26, + "grad_norm": 1.1725573778245444, + "learning_rate": 1.73559402401913e-05, + "loss": 0.5861, + "step": 3349 + }, + { + "epoch": 0.26, + "grad_norm": 1.1778224829870367, + "learning_rate": 1.7354237867724805e-05, + "loss": 0.5585, + "step": 3350 + }, + { + "epoch": 0.26, + "grad_norm": 1.1484453888706514, + "learning_rate": 1.7352535030947334e-05, + "loss": 0.5919, + "step": 3351 + }, + { + "epoch": 0.26, + "grad_norm": 1.2300903707350819, + "learning_rate": 1.735083172996639e-05, + "loss": 0.6022, + "step": 3352 + }, + { + "epoch": 0.26, + "grad_norm": 1.2197658022454867, + "learning_rate": 1.7349127964889508e-05, + "loss": 0.6532, + "step": 3353 + }, + { + "epoch": 0.26, + "grad_norm": 1.231761676950544, + "learning_rate": 1.7347423735824266e-05, + "loss": 0.6267, + "step": 3354 + }, + { + "epoch": 0.26, + "grad_norm": 1.2859242008675675, + "learning_rate": 1.734571904287826e-05, + "loss": 0.6163, + "step": 3355 + }, + { + "epoch": 0.26, + "grad_norm": 1.178086514438827, + "learning_rate": 1.7344013886159104e-05, + "loss": 0.617, + "step": 3356 + }, + { + "epoch": 0.26, + "grad_norm": 1.3033751323140503, + "learning_rate": 1.7342308265774467e-05, + "loss": 0.6533, + "step": 3357 + }, + { + "epoch": 0.26, + "grad_norm": 1.2567069366100936, + "learning_rate": 1.7340602181832028e-05, + "loss": 0.6165, + "step": 3358 + }, + { + "epoch": 0.26, + "grad_norm": 1.2007666920143658, + "learning_rate": 1.7338895634439496e-05, + "loss": 0.6301, + "step": 3359 + }, + { + "epoch": 0.26, + "grad_norm": 1.2706204480991743, + "learning_rate": 1.733718862370462e-05, + "loss": 0.6285, + "step": 3360 + }, + { + "epoch": 0.26, + "grad_norm": 1.3147399498714427, + "learning_rate": 1.7335481149735173e-05, + "loss": 0.6519, + "step": 3361 + }, + { + "epoch": 0.26, + "grad_norm": 1.228480934417426, + "learning_rate": 1.7333773212638957e-05, + "loss": 0.5986, + "step": 3362 + }, + { + "epoch": 0.26, + "grad_norm": 1.1688612777855818, + "learning_rate": 1.73320648125238e-05, + "loss": 0.5989, + "step": 3363 + }, + { + "epoch": 0.26, + "grad_norm": 1.1706408296035187, + "learning_rate": 1.733035594949756e-05, + "loss": 0.5714, + "step": 3364 + }, + { + "epoch": 0.26, + "grad_norm": 1.184445367499233, + "learning_rate": 1.732864662366813e-05, + "loss": 0.5902, + "step": 3365 + }, + { + "epoch": 0.26, + "grad_norm": 1.2335445652748194, + "learning_rate": 1.7326936835143427e-05, + "loss": 0.5393, + "step": 3366 + }, + { + "epoch": 0.26, + "grad_norm": 1.2273502128717442, + "learning_rate": 1.73252265840314e-05, + "loss": 0.6452, + "step": 3367 + }, + { + "epoch": 0.26, + "grad_norm": 1.2024560034429803, + "learning_rate": 1.7323515870440027e-05, + "loss": 0.626, + "step": 3368 + }, + { + "epoch": 0.26, + "grad_norm": 1.094234086947422, + "learning_rate": 1.7321804694477314e-05, + "loss": 0.5842, + "step": 3369 + }, + { + "epoch": 0.26, + "grad_norm": 1.2529117050723875, + "learning_rate": 1.7320093056251293e-05, + "loss": 0.6102, + "step": 3370 + }, + { + "epoch": 0.26, + "grad_norm": 1.1553890528349962, + "learning_rate": 1.7318380955870032e-05, + "loss": 0.5824, + "step": 3371 + }, + { + "epoch": 0.26, + "grad_norm": 1.1919226856024154, + "learning_rate": 1.7316668393441622e-05, + "loss": 0.6054, + "step": 3372 + }, + { + "epoch": 0.26, + "grad_norm": 1.3072211967208809, + "learning_rate": 1.731495536907419e-05, + "loss": 0.6519, + "step": 3373 + }, + { + "epoch": 0.26, + "grad_norm": 1.3214918207990012, + "learning_rate": 1.7313241882875883e-05, + "loss": 0.6443, + "step": 3374 + }, + { + "epoch": 0.26, + "grad_norm": 1.15111137052502, + "learning_rate": 1.7311527934954885e-05, + "loss": 0.5764, + "step": 3375 + }, + { + "epoch": 0.26, + "grad_norm": 1.2135490067987205, + "learning_rate": 1.7309813525419403e-05, + "loss": 0.6, + "step": 3376 + }, + { + "epoch": 0.26, + "grad_norm": 1.0840930781116456, + "learning_rate": 1.730809865437768e-05, + "loss": 0.5167, + "step": 3377 + }, + { + "epoch": 0.26, + "grad_norm": 1.2688477069834467, + "learning_rate": 1.7306383321937986e-05, + "loss": 0.6351, + "step": 3378 + }, + { + "epoch": 0.26, + "grad_norm": 1.141371495777199, + "learning_rate": 1.730466752820862e-05, + "loss": 0.6126, + "step": 3379 + }, + { + "epoch": 0.26, + "grad_norm": 1.304618468142981, + "learning_rate": 1.7302951273297904e-05, + "loss": 0.6449, + "step": 3380 + }, + { + "epoch": 0.26, + "grad_norm": 1.1519705992596048, + "learning_rate": 1.7301234557314194e-05, + "loss": 0.5883, + "step": 3381 + }, + { + "epoch": 0.26, + "grad_norm": 1.200634248331523, + "learning_rate": 1.7299517380365877e-05, + "loss": 0.6639, + "step": 3382 + }, + { + "epoch": 0.26, + "grad_norm": 1.2580156339968729, + "learning_rate": 1.7297799742561367e-05, + "loss": 0.5991, + "step": 3383 + }, + { + "epoch": 0.26, + "grad_norm": 1.2387512951487094, + "learning_rate": 1.729608164400911e-05, + "loss": 0.5469, + "step": 3384 + }, + { + "epoch": 0.26, + "grad_norm": 1.2410634551228308, + "learning_rate": 1.7294363084817573e-05, + "loss": 0.6314, + "step": 3385 + }, + { + "epoch": 0.26, + "grad_norm": 1.2780469539476047, + "learning_rate": 1.7292644065095263e-05, + "loss": 0.6622, + "step": 3386 + }, + { + "epoch": 0.26, + "grad_norm": 1.2191027595386779, + "learning_rate": 1.7290924584950704e-05, + "loss": 0.5835, + "step": 3387 + }, + { + "epoch": 0.26, + "grad_norm": 1.1113196508423067, + "learning_rate": 1.7289204644492463e-05, + "loss": 0.6007, + "step": 3388 + }, + { + "epoch": 0.26, + "grad_norm": 1.1969095668332939, + "learning_rate": 1.7287484243829126e-05, + "loss": 0.6494, + "step": 3389 + }, + { + "epoch": 0.26, + "grad_norm": 1.1119038945732729, + "learning_rate": 1.728576338306931e-05, + "loss": 0.6113, + "step": 3390 + }, + { + "epoch": 0.26, + "grad_norm": 1.302236069939979, + "learning_rate": 1.7284042062321663e-05, + "loss": 0.6146, + "step": 3391 + }, + { + "epoch": 0.26, + "grad_norm": 1.1797731886304792, + "learning_rate": 1.7282320281694857e-05, + "loss": 0.5641, + "step": 3392 + }, + { + "epoch": 0.26, + "grad_norm": 1.254508851136797, + "learning_rate": 1.72805980412976e-05, + "loss": 0.597, + "step": 3393 + }, + { + "epoch": 0.26, + "grad_norm": 1.3392448310061567, + "learning_rate": 1.7278875341238627e-05, + "loss": 0.7139, + "step": 3394 + }, + { + "epoch": 0.26, + "grad_norm": 1.3181460059541965, + "learning_rate": 1.7277152181626703e-05, + "loss": 0.6158, + "step": 3395 + }, + { + "epoch": 0.26, + "grad_norm": 1.2433147473944648, + "learning_rate": 1.727542856257061e-05, + "loss": 0.6183, + "step": 3396 + }, + { + "epoch": 0.26, + "grad_norm": 1.2530361496017832, + "learning_rate": 1.727370448417918e-05, + "loss": 0.6041, + "step": 3397 + }, + { + "epoch": 0.26, + "grad_norm": 1.244108094848589, + "learning_rate": 1.7271979946561256e-05, + "loss": 0.611, + "step": 3398 + }, + { + "epoch": 0.26, + "grad_norm": 1.2718321153313898, + "learning_rate": 1.7270254949825722e-05, + "loss": 0.602, + "step": 3399 + }, + { + "epoch": 0.26, + "grad_norm": 1.1197390462650894, + "learning_rate": 1.726852949408148e-05, + "loss": 0.548, + "step": 3400 + }, + { + "epoch": 0.26, + "grad_norm": 1.1633571561562022, + "learning_rate": 1.7266803579437472e-05, + "loss": 0.6289, + "step": 3401 + }, + { + "epoch": 0.26, + "grad_norm": 1.2020785747590357, + "learning_rate": 1.7265077206002664e-05, + "loss": 0.6174, + "step": 3402 + }, + { + "epoch": 0.26, + "grad_norm": 1.1912704640539116, + "learning_rate": 1.7263350373886046e-05, + "loss": 0.5959, + "step": 3403 + }, + { + "epoch": 0.26, + "grad_norm": 1.212634810592529, + "learning_rate": 1.726162308319665e-05, + "loss": 0.6457, + "step": 3404 + }, + { + "epoch": 0.26, + "grad_norm": 1.1948400822978884, + "learning_rate": 1.7259895334043516e-05, + "loss": 0.6085, + "step": 3405 + }, + { + "epoch": 0.26, + "grad_norm": 1.269153010436185, + "learning_rate": 1.725816712653574e-05, + "loss": 0.6412, + "step": 3406 + }, + { + "epoch": 0.26, + "grad_norm": 1.176745402796988, + "learning_rate": 1.7256438460782427e-05, + "loss": 0.6127, + "step": 3407 + }, + { + "epoch": 0.26, + "grad_norm": 1.1200628026316155, + "learning_rate": 1.725470933689271e-05, + "loss": 0.488, + "step": 3408 + }, + { + "epoch": 0.26, + "grad_norm": 1.2441838854300658, + "learning_rate": 1.7252979754975765e-05, + "loss": 0.6186, + "step": 3409 + }, + { + "epoch": 0.26, + "grad_norm": 1.190531375037393, + "learning_rate": 1.725124971514079e-05, + "loss": 0.6223, + "step": 3410 + }, + { + "epoch": 0.26, + "grad_norm": 1.231457122576695, + "learning_rate": 1.7249519217497007e-05, + "loss": 0.6121, + "step": 3411 + }, + { + "epoch": 0.26, + "grad_norm": 1.1829266572436101, + "learning_rate": 1.7247788262153673e-05, + "loss": 0.6121, + "step": 3412 + }, + { + "epoch": 0.26, + "grad_norm": 1.290336949899096, + "learning_rate": 1.724605684922007e-05, + "loss": 0.6457, + "step": 3413 + }, + { + "epoch": 0.26, + "grad_norm": 1.253085714661441, + "learning_rate": 1.7244324978805516e-05, + "loss": 0.6617, + "step": 3414 + }, + { + "epoch": 0.26, + "grad_norm": 1.4160945054807461, + "learning_rate": 1.7242592651019353e-05, + "loss": 0.6496, + "step": 3415 + }, + { + "epoch": 0.27, + "grad_norm": 1.1085313624865156, + "learning_rate": 1.7240859865970948e-05, + "loss": 0.5781, + "step": 3416 + }, + { + "epoch": 0.27, + "grad_norm": 1.1939958289311587, + "learning_rate": 1.7239126623769703e-05, + "loss": 0.6262, + "step": 3417 + }, + { + "epoch": 0.27, + "grad_norm": 1.2282199716125397, + "learning_rate": 1.7237392924525037e-05, + "loss": 0.6242, + "step": 3418 + }, + { + "epoch": 0.27, + "grad_norm": 1.2968295905194807, + "learning_rate": 1.7235658768346422e-05, + "loss": 0.6765, + "step": 3419 + }, + { + "epoch": 0.27, + "grad_norm": 1.2039937932529343, + "learning_rate": 1.723392415534334e-05, + "loss": 0.551, + "step": 3420 + }, + { + "epoch": 0.27, + "grad_norm": 1.182970392732156, + "learning_rate": 1.72321890856253e-05, + "loss": 0.5846, + "step": 3421 + }, + { + "epoch": 0.27, + "grad_norm": 1.2279794368917152, + "learning_rate": 1.723045355930185e-05, + "loss": 0.5978, + "step": 3422 + }, + { + "epoch": 0.27, + "grad_norm": 1.1248688621383214, + "learning_rate": 1.7228717576482563e-05, + "loss": 0.5783, + "step": 3423 + }, + { + "epoch": 0.27, + "grad_norm": 1.3525211704874793, + "learning_rate": 1.722698113727704e-05, + "loss": 0.6685, + "step": 3424 + }, + { + "epoch": 0.27, + "grad_norm": 1.1919329370220593, + "learning_rate": 1.7225244241794916e-05, + "loss": 0.528, + "step": 3425 + }, + { + "epoch": 0.27, + "grad_norm": 1.2538218722370853, + "learning_rate": 1.7223506890145842e-05, + "loss": 0.6043, + "step": 3426 + }, + { + "epoch": 0.27, + "grad_norm": 1.1875692648513518, + "learning_rate": 1.7221769082439508e-05, + "loss": 0.5975, + "step": 3427 + }, + { + "epoch": 0.27, + "grad_norm": 1.2182915143056035, + "learning_rate": 1.7220030818785635e-05, + "loss": 0.6354, + "step": 3428 + }, + { + "epoch": 0.27, + "grad_norm": 1.2227457519532332, + "learning_rate": 1.721829209929396e-05, + "loss": 0.6272, + "step": 3429 + }, + { + "epoch": 0.27, + "grad_norm": 1.271587361795596, + "learning_rate": 1.721655292407427e-05, + "loss": 0.684, + "step": 3430 + }, + { + "epoch": 0.27, + "grad_norm": 1.1728294046486247, + "learning_rate": 1.721481329323636e-05, + "loss": 0.5759, + "step": 3431 + }, + { + "epoch": 0.27, + "grad_norm": 1.2359590627378376, + "learning_rate": 1.7213073206890063e-05, + "loss": 0.6133, + "step": 3432 + }, + { + "epoch": 0.27, + "grad_norm": 1.2561826868195167, + "learning_rate": 1.721133266514524e-05, + "loss": 0.6566, + "step": 3433 + }, + { + "epoch": 0.27, + "grad_norm": 1.1822220309864055, + "learning_rate": 1.720959166811178e-05, + "loss": 0.6348, + "step": 3434 + }, + { + "epoch": 0.27, + "grad_norm": 1.3352234476945937, + "learning_rate": 1.72078502158996e-05, + "loss": 0.6142, + "step": 3435 + }, + { + "epoch": 0.27, + "grad_norm": 1.2966064266947603, + "learning_rate": 1.720610830861865e-05, + "loss": 0.6492, + "step": 3436 + }, + { + "epoch": 0.27, + "grad_norm": 1.0881068564386256, + "learning_rate": 1.7204365946378906e-05, + "loss": 0.5589, + "step": 3437 + }, + { + "epoch": 0.27, + "grad_norm": 1.13118300476648, + "learning_rate": 1.7202623129290367e-05, + "loss": 0.5947, + "step": 3438 + }, + { + "epoch": 0.27, + "grad_norm": 1.2141565176007922, + "learning_rate": 1.720087985746307e-05, + "loss": 0.5997, + "step": 3439 + }, + { + "epoch": 0.27, + "grad_norm": 1.2549494506716794, + "learning_rate": 1.7199136131007074e-05, + "loss": 0.5939, + "step": 3440 + }, + { + "epoch": 0.27, + "grad_norm": 1.1754788234755198, + "learning_rate": 1.719739195003247e-05, + "loss": 0.6065, + "step": 3441 + }, + { + "epoch": 0.27, + "grad_norm": 1.1796197934962993, + "learning_rate": 1.7195647314649383e-05, + "loss": 0.599, + "step": 3442 + }, + { + "epoch": 0.27, + "grad_norm": 1.2069171740503573, + "learning_rate": 1.7193902224967956e-05, + "loss": 0.6181, + "step": 3443 + }, + { + "epoch": 0.27, + "grad_norm": 1.3393779868174052, + "learning_rate": 1.7192156681098364e-05, + "loss": 0.5985, + "step": 3444 + }, + { + "epoch": 0.27, + "grad_norm": 1.1614857771614773, + "learning_rate": 1.7190410683150816e-05, + "loss": 0.5979, + "step": 3445 + }, + { + "epoch": 0.27, + "grad_norm": 1.221602207962269, + "learning_rate": 1.7188664231235544e-05, + "loss": 0.5734, + "step": 3446 + }, + { + "epoch": 0.27, + "grad_norm": 1.2435657844030366, + "learning_rate": 1.7186917325462808e-05, + "loss": 0.6279, + "step": 3447 + }, + { + "epoch": 0.27, + "grad_norm": 1.252003209009415, + "learning_rate": 1.71851699659429e-05, + "loss": 0.6782, + "step": 3448 + }, + { + "epoch": 0.27, + "grad_norm": 1.168796054932257, + "learning_rate": 1.7183422152786145e-05, + "loss": 0.5885, + "step": 3449 + }, + { + "epoch": 0.27, + "grad_norm": 1.1936584697205102, + "learning_rate": 1.718167388610289e-05, + "loss": 0.5842, + "step": 3450 + }, + { + "epoch": 0.27, + "grad_norm": 1.1738174107930588, + "learning_rate": 1.7179925166003506e-05, + "loss": 0.5997, + "step": 3451 + }, + { + "epoch": 0.27, + "grad_norm": 1.1548514284630382, + "learning_rate": 1.71781759925984e-05, + "loss": 0.5791, + "step": 3452 + }, + { + "epoch": 0.27, + "grad_norm": 1.23274374263663, + "learning_rate": 1.7176426365998015e-05, + "loss": 0.6286, + "step": 3453 + }, + { + "epoch": 0.27, + "grad_norm": 1.1556862668290826, + "learning_rate": 1.7174676286312807e-05, + "loss": 0.5293, + "step": 3454 + }, + { + "epoch": 0.27, + "grad_norm": 1.2823091641597584, + "learning_rate": 1.717292575365327e-05, + "loss": 0.6569, + "step": 3455 + }, + { + "epoch": 0.27, + "grad_norm": 1.2788120428492529, + "learning_rate": 1.717117476812992e-05, + "loss": 0.6217, + "step": 3456 + }, + { + "epoch": 0.27, + "grad_norm": 1.3612913020009727, + "learning_rate": 1.7169423329853307e-05, + "loss": 0.687, + "step": 3457 + }, + { + "epoch": 0.27, + "grad_norm": 1.3048652482160341, + "learning_rate": 1.7167671438934014e-05, + "loss": 0.5577, + "step": 3458 + }, + { + "epoch": 0.27, + "grad_norm": 1.127044674015843, + "learning_rate": 1.7165919095482636e-05, + "loss": 0.5872, + "step": 3459 + }, + { + "epoch": 0.27, + "grad_norm": 1.1676144780267974, + "learning_rate": 1.716416629960982e-05, + "loss": 0.6488, + "step": 3460 + }, + { + "epoch": 0.27, + "grad_norm": 1.365012744382953, + "learning_rate": 1.7162413051426222e-05, + "loss": 0.6613, + "step": 3461 + }, + { + "epoch": 0.27, + "grad_norm": 1.0879256895156963, + "learning_rate": 1.7160659351042533e-05, + "loss": 0.5603, + "step": 3462 + }, + { + "epoch": 0.27, + "grad_norm": 1.1720538193645633, + "learning_rate": 1.7158905198569476e-05, + "loss": 0.5483, + "step": 3463 + }, + { + "epoch": 0.27, + "grad_norm": 1.23895635162207, + "learning_rate": 1.7157150594117805e-05, + "loss": 0.6116, + "step": 3464 + }, + { + "epoch": 0.27, + "grad_norm": 1.1681729651696895, + "learning_rate": 1.7155395537798282e-05, + "loss": 0.5921, + "step": 3465 + }, + { + "epoch": 0.27, + "grad_norm": 1.2154166781050915, + "learning_rate": 1.7153640029721726e-05, + "loss": 0.6222, + "step": 3466 + }, + { + "epoch": 0.27, + "grad_norm": 1.2614323908151381, + "learning_rate": 1.7151884069998966e-05, + "loss": 0.6172, + "step": 3467 + }, + { + "epoch": 0.27, + "grad_norm": 1.1960065235546904, + "learning_rate": 1.7150127658740868e-05, + "loss": 0.5752, + "step": 3468 + }, + { + "epoch": 0.27, + "grad_norm": 1.2162324383086238, + "learning_rate": 1.7148370796058316e-05, + "loss": 0.6484, + "step": 3469 + }, + { + "epoch": 0.27, + "grad_norm": 1.0929767054826567, + "learning_rate": 1.714661348206224e-05, + "loss": 0.5337, + "step": 3470 + }, + { + "epoch": 0.27, + "grad_norm": 1.2129818303067432, + "learning_rate": 1.714485571686358e-05, + "loss": 0.6408, + "step": 3471 + }, + { + "epoch": 0.27, + "grad_norm": 1.2663385710849338, + "learning_rate": 1.7143097500573314e-05, + "loss": 0.6163, + "step": 3472 + }, + { + "epoch": 0.27, + "grad_norm": 1.3020229427320296, + "learning_rate": 1.7141338833302454e-05, + "loss": 0.6458, + "step": 3473 + }, + { + "epoch": 0.27, + "grad_norm": 1.2019619955951104, + "learning_rate": 1.713957971516203e-05, + "loss": 0.6255, + "step": 3474 + }, + { + "epoch": 0.27, + "grad_norm": 1.2143210117435042, + "learning_rate": 1.71378201462631e-05, + "loss": 0.5798, + "step": 3475 + }, + { + "epoch": 0.27, + "grad_norm": 1.2721459795989332, + "learning_rate": 1.7136060126716756e-05, + "loss": 0.6014, + "step": 3476 + }, + { + "epoch": 0.27, + "grad_norm": 1.1761004752814077, + "learning_rate": 1.7134299656634124e-05, + "loss": 0.5792, + "step": 3477 + }, + { + "epoch": 0.27, + "grad_norm": 1.1795035717802151, + "learning_rate": 1.7132538736126342e-05, + "loss": 0.6121, + "step": 3478 + }, + { + "epoch": 0.27, + "grad_norm": 1.2644927529202392, + "learning_rate": 1.713077736530459e-05, + "loss": 0.6117, + "step": 3479 + }, + { + "epoch": 0.27, + "grad_norm": 1.2594005913417883, + "learning_rate": 1.712901554428008e-05, + "loss": 0.6772, + "step": 3480 + }, + { + "epoch": 0.27, + "grad_norm": 1.3499656902474506, + "learning_rate": 1.7127253273164032e-05, + "loss": 0.6002, + "step": 3481 + }, + { + "epoch": 0.27, + "grad_norm": 1.2425175834040978, + "learning_rate": 1.7125490552067713e-05, + "loss": 0.6348, + "step": 3482 + }, + { + "epoch": 0.27, + "grad_norm": 1.1711106414289294, + "learning_rate": 1.7123727381102417e-05, + "loss": 0.6374, + "step": 3483 + }, + { + "epoch": 0.27, + "grad_norm": 1.1554935145738767, + "learning_rate": 1.7121963760379453e-05, + "loss": 0.5657, + "step": 3484 + }, + { + "epoch": 0.27, + "grad_norm": 1.6971287995949367, + "learning_rate": 1.7120199690010176e-05, + "loss": 0.6261, + "step": 3485 + }, + { + "epoch": 0.27, + "grad_norm": 1.1980711077219164, + "learning_rate": 1.7118435170105955e-05, + "loss": 0.6241, + "step": 3486 + }, + { + "epoch": 0.27, + "grad_norm": 1.2369071484397642, + "learning_rate": 1.7116670200778192e-05, + "loss": 0.6096, + "step": 3487 + }, + { + "epoch": 0.27, + "grad_norm": 1.143335805219271, + "learning_rate": 1.711490478213833e-05, + "loss": 0.5794, + "step": 3488 + }, + { + "epoch": 0.27, + "grad_norm": 1.3044845057656975, + "learning_rate": 1.7113138914297817e-05, + "loss": 0.5988, + "step": 3489 + }, + { + "epoch": 0.27, + "grad_norm": 1.2158888464306223, + "learning_rate": 1.7111372597368143e-05, + "loss": 0.6161, + "step": 3490 + }, + { + "epoch": 0.27, + "grad_norm": 1.1621666132831512, + "learning_rate": 1.7109605831460833e-05, + "loss": 0.597, + "step": 3491 + }, + { + "epoch": 0.27, + "grad_norm": 1.2148988472075475, + "learning_rate": 1.710783861668742e-05, + "loss": 0.5849, + "step": 3492 + }, + { + "epoch": 0.27, + "grad_norm": 1.2343771246396866, + "learning_rate": 1.7106070953159487e-05, + "loss": 0.6387, + "step": 3493 + }, + { + "epoch": 0.27, + "grad_norm": 1.1944441789188138, + "learning_rate": 1.710430284098863e-05, + "loss": 0.5823, + "step": 3494 + }, + { + "epoch": 0.27, + "grad_norm": 1.1677784844503332, + "learning_rate": 1.7102534280286483e-05, + "loss": 0.5873, + "step": 3495 + }, + { + "epoch": 0.27, + "grad_norm": 1.1186957149015377, + "learning_rate": 1.71007652711647e-05, + "loss": 0.5679, + "step": 3496 + }, + { + "epoch": 0.27, + "grad_norm": 1.2166223293656484, + "learning_rate": 1.7098995813734974e-05, + "loss": 0.6354, + "step": 3497 + }, + { + "epoch": 0.27, + "grad_norm": 1.2219242915286959, + "learning_rate": 1.7097225908109015e-05, + "loss": 0.6316, + "step": 3498 + }, + { + "epoch": 0.27, + "grad_norm": 1.1823154508688276, + "learning_rate": 1.7095455554398564e-05, + "loss": 0.5865, + "step": 3499 + }, + { + "epoch": 0.27, + "grad_norm": 1.2132324130318173, + "learning_rate": 1.70936847527154e-05, + "loss": 0.5596, + "step": 3500 + }, + { + "epoch": 0.27, + "grad_norm": 1.2081526697959808, + "learning_rate": 1.709191350317132e-05, + "loss": 0.6299, + "step": 3501 + }, + { + "epoch": 0.27, + "grad_norm": 1.1080447535927218, + "learning_rate": 1.709014180587815e-05, + "loss": 0.5576, + "step": 3502 + }, + { + "epoch": 0.27, + "grad_norm": 1.2654575189991, + "learning_rate": 1.7088369660947743e-05, + "loss": 0.6246, + "step": 3503 + }, + { + "epoch": 0.27, + "grad_norm": 1.1062880536447204, + "learning_rate": 1.7086597068491994e-05, + "loss": 0.5676, + "step": 3504 + }, + { + "epoch": 0.27, + "grad_norm": 1.1743979697343756, + "learning_rate": 1.7084824028622807e-05, + "loss": 0.6279, + "step": 3505 + }, + { + "epoch": 0.27, + "grad_norm": 1.2516906272653687, + "learning_rate": 1.708305054145213e-05, + "loss": 0.6408, + "step": 3506 + }, + { + "epoch": 0.27, + "grad_norm": 1.112168194012932, + "learning_rate": 1.7081276607091925e-05, + "loss": 0.5922, + "step": 3507 + }, + { + "epoch": 0.27, + "grad_norm": 1.2045648059631897, + "learning_rate": 1.7079502225654192e-05, + "loss": 0.5833, + "step": 3508 + }, + { + "epoch": 0.27, + "grad_norm": 1.2075480123416586, + "learning_rate": 1.707772739725096e-05, + "loss": 0.525, + "step": 3509 + }, + { + "epoch": 0.27, + "grad_norm": 1.178969006545972, + "learning_rate": 1.7075952121994282e-05, + "loss": 0.5985, + "step": 3510 + }, + { + "epoch": 0.27, + "grad_norm": 1.2524471171287925, + "learning_rate": 1.707417639999624e-05, + "loss": 0.6069, + "step": 3511 + }, + { + "epoch": 0.27, + "grad_norm": 1.2340928250120227, + "learning_rate": 1.707240023136894e-05, + "loss": 0.5795, + "step": 3512 + }, + { + "epoch": 0.27, + "grad_norm": 1.1014666684718406, + "learning_rate": 1.7070623616224528e-05, + "loss": 0.578, + "step": 3513 + }, + { + "epoch": 0.27, + "grad_norm": 1.1604010840601888, + "learning_rate": 1.7068846554675166e-05, + "loss": 0.5926, + "step": 3514 + }, + { + "epoch": 0.27, + "grad_norm": 1.293871745771315, + "learning_rate": 1.706706904683305e-05, + "loss": 0.6135, + "step": 3515 + }, + { + "epoch": 0.27, + "grad_norm": 1.1432762165279406, + "learning_rate": 1.7065291092810406e-05, + "loss": 0.5393, + "step": 3516 + }, + { + "epoch": 0.27, + "grad_norm": 1.262780890990764, + "learning_rate": 1.7063512692719482e-05, + "loss": 0.6187, + "step": 3517 + }, + { + "epoch": 0.27, + "grad_norm": 1.1681728120982255, + "learning_rate": 1.7061733846672562e-05, + "loss": 0.5414, + "step": 3518 + }, + { + "epoch": 0.27, + "grad_norm": 1.1907958921806925, + "learning_rate": 1.7059954554781945e-05, + "loss": 0.5693, + "step": 3519 + }, + { + "epoch": 0.27, + "grad_norm": 1.2319715254391224, + "learning_rate": 1.7058174817159973e-05, + "loss": 0.6132, + "step": 3520 + }, + { + "epoch": 0.27, + "grad_norm": 1.240147577086982, + "learning_rate": 1.7056394633919012e-05, + "loss": 0.6046, + "step": 3521 + }, + { + "epoch": 0.27, + "grad_norm": 1.5087442632153663, + "learning_rate": 1.705461400517145e-05, + "loss": 0.5853, + "step": 3522 + }, + { + "epoch": 0.27, + "grad_norm": 1.4098506184169337, + "learning_rate": 1.705283293102971e-05, + "loss": 0.5993, + "step": 3523 + }, + { + "epoch": 0.27, + "grad_norm": 1.2521401680706161, + "learning_rate": 1.7051051411606238e-05, + "loss": 0.5966, + "step": 3524 + }, + { + "epoch": 0.27, + "grad_norm": 1.2914691076656413, + "learning_rate": 1.7049269447013515e-05, + "loss": 0.6186, + "step": 3525 + }, + { + "epoch": 0.27, + "grad_norm": 1.3109645945280441, + "learning_rate": 1.704748703736404e-05, + "loss": 0.6218, + "step": 3526 + }, + { + "epoch": 0.27, + "grad_norm": 1.2406136000563492, + "learning_rate": 1.7045704182770346e-05, + "loss": 0.5841, + "step": 3527 + }, + { + "epoch": 0.27, + "grad_norm": 1.267768974807635, + "learning_rate": 1.7043920883344998e-05, + "loss": 0.5947, + "step": 3528 + }, + { + "epoch": 0.27, + "grad_norm": 1.2891440163471761, + "learning_rate": 1.7042137139200583e-05, + "loss": 0.6555, + "step": 3529 + }, + { + "epoch": 0.27, + "grad_norm": 1.2079516608248553, + "learning_rate": 1.7040352950449716e-05, + "loss": 0.6532, + "step": 3530 + }, + { + "epoch": 0.27, + "grad_norm": 1.1711107432205794, + "learning_rate": 1.7038568317205045e-05, + "loss": 0.5984, + "step": 3531 + }, + { + "epoch": 0.27, + "grad_norm": 1.2320295335059828, + "learning_rate": 1.7036783239579243e-05, + "loss": 0.6114, + "step": 3532 + }, + { + "epoch": 0.27, + "grad_norm": 1.3518348816285979, + "learning_rate": 1.703499771768501e-05, + "loss": 0.6722, + "step": 3533 + }, + { + "epoch": 0.27, + "grad_norm": 1.419602474930434, + "learning_rate": 1.7033211751635074e-05, + "loss": 0.6368, + "step": 3534 + }, + { + "epoch": 0.27, + "grad_norm": 20.15933993999819, + "learning_rate": 1.7031425341542193e-05, + "loss": 0.6554, + "step": 3535 + }, + { + "epoch": 0.27, + "grad_norm": 1.2699663561021641, + "learning_rate": 1.7029638487519155e-05, + "loss": 0.6151, + "step": 3536 + }, + { + "epoch": 0.27, + "grad_norm": 1.3049581098592968, + "learning_rate": 1.702785118967877e-05, + "loss": 0.622, + "step": 3537 + }, + { + "epoch": 0.27, + "grad_norm": 1.2158346274885192, + "learning_rate": 1.702606344813388e-05, + "loss": 0.5997, + "step": 3538 + }, + { + "epoch": 0.27, + "grad_norm": 1.1389321083820854, + "learning_rate": 1.7024275262997358e-05, + "loss": 0.5108, + "step": 3539 + }, + { + "epoch": 0.27, + "grad_norm": 1.6805761780887818, + "learning_rate": 1.702248663438209e-05, + "loss": 0.6208, + "step": 3540 + }, + { + "epoch": 0.27, + "grad_norm": 1.3455553125886481, + "learning_rate": 1.7020697562401017e-05, + "loss": 0.5675, + "step": 3541 + }, + { + "epoch": 0.27, + "grad_norm": 1.0953874592292865, + "learning_rate": 1.7018908047167083e-05, + "loss": 0.5658, + "step": 3542 + }, + { + "epoch": 0.27, + "grad_norm": 1.1857180526808158, + "learning_rate": 1.7017118088793267e-05, + "loss": 0.609, + "step": 3543 + }, + { + "epoch": 0.27, + "grad_norm": 1.150018343571857, + "learning_rate": 1.701532768739259e-05, + "loss": 0.5614, + "step": 3544 + }, + { + "epoch": 0.28, + "grad_norm": 1.2616653667293758, + "learning_rate": 1.7013536843078077e-05, + "loss": 0.6334, + "step": 3545 + }, + { + "epoch": 0.28, + "grad_norm": 1.2122241168430332, + "learning_rate": 1.70117455559628e-05, + "loss": 0.5925, + "step": 3546 + }, + { + "epoch": 0.28, + "grad_norm": 1.171793820430402, + "learning_rate": 1.700995382615985e-05, + "loss": 0.5664, + "step": 3547 + }, + { + "epoch": 0.28, + "grad_norm": 1.1670900439489724, + "learning_rate": 1.7008161653782344e-05, + "loss": 0.5695, + "step": 3548 + }, + { + "epoch": 0.28, + "grad_norm": 1.1869272055250546, + "learning_rate": 1.7006369038943443e-05, + "loss": 0.5299, + "step": 3549 + }, + { + "epoch": 0.28, + "grad_norm": 1.1249377975221864, + "learning_rate": 1.700457598175631e-05, + "loss": 0.5286, + "step": 3550 + }, + { + "epoch": 0.28, + "grad_norm": 1.2016604042595487, + "learning_rate": 1.700278248233416e-05, + "loss": 0.6292, + "step": 3551 + }, + { + "epoch": 0.28, + "grad_norm": 1.1327456224365782, + "learning_rate": 1.700098854079022e-05, + "loss": 0.5653, + "step": 3552 + }, + { + "epoch": 0.28, + "grad_norm": 1.146229860301497, + "learning_rate": 1.6999194157237753e-05, + "loss": 0.5406, + "step": 3553 + }, + { + "epoch": 0.28, + "grad_norm": 1.2107167196510564, + "learning_rate": 1.699739933179005e-05, + "loss": 0.6163, + "step": 3554 + }, + { + "epoch": 0.28, + "grad_norm": 1.128816013623664, + "learning_rate": 1.6995604064560426e-05, + "loss": 0.5878, + "step": 3555 + }, + { + "epoch": 0.28, + "grad_norm": 1.2362328563461915, + "learning_rate": 1.699380835566222e-05, + "loss": 0.567, + "step": 3556 + }, + { + "epoch": 0.28, + "grad_norm": 1.1748535450505353, + "learning_rate": 1.6992012205208814e-05, + "loss": 0.6064, + "step": 3557 + }, + { + "epoch": 0.28, + "grad_norm": 1.2243381911815883, + "learning_rate": 1.6990215613313602e-05, + "loss": 0.5744, + "step": 3558 + }, + { + "epoch": 0.28, + "grad_norm": 1.2039026001634172, + "learning_rate": 1.6988418580090013e-05, + "loss": 0.6304, + "step": 3559 + }, + { + "epoch": 0.28, + "grad_norm": 1.1472622313821053, + "learning_rate": 1.6986621105651505e-05, + "loss": 0.5126, + "step": 3560 + }, + { + "epoch": 0.28, + "grad_norm": 1.2402514358908967, + "learning_rate": 1.6984823190111558e-05, + "loss": 0.6169, + "step": 3561 + }, + { + "epoch": 0.28, + "grad_norm": 1.2778025978322296, + "learning_rate": 1.698302483358369e-05, + "loss": 0.5921, + "step": 3562 + }, + { + "epoch": 0.28, + "grad_norm": 1.1746282659089236, + "learning_rate": 1.6981226036181433e-05, + "loss": 0.6314, + "step": 3563 + }, + { + "epoch": 0.28, + "grad_norm": 1.2409812300505223, + "learning_rate": 1.6979426798018356e-05, + "loss": 0.5945, + "step": 3564 + }, + { + "epoch": 0.28, + "grad_norm": 1.140078322162819, + "learning_rate": 1.697762711920806e-05, + "loss": 0.5977, + "step": 3565 + }, + { + "epoch": 0.28, + "grad_norm": 1.327162854114233, + "learning_rate": 1.697582699986416e-05, + "loss": 0.6625, + "step": 3566 + }, + { + "epoch": 0.28, + "grad_norm": 1.195544500836556, + "learning_rate": 1.697402644010032e-05, + "loss": 0.59, + "step": 3567 + }, + { + "epoch": 0.28, + "grad_norm": 1.2330646561611454, + "learning_rate": 1.6972225440030203e-05, + "loss": 0.5952, + "step": 3568 + }, + { + "epoch": 0.28, + "grad_norm": 1.3666670326294448, + "learning_rate": 1.697042399976752e-05, + "loss": 0.6384, + "step": 3569 + }, + { + "epoch": 0.28, + "grad_norm": 1.1439809680636073, + "learning_rate": 1.6968622119426013e-05, + "loss": 0.5658, + "step": 3570 + }, + { + "epoch": 0.28, + "grad_norm": 1.349977611426743, + "learning_rate": 1.696681979911943e-05, + "loss": 0.6397, + "step": 3571 + }, + { + "epoch": 0.28, + "grad_norm": 1.1624509165256394, + "learning_rate": 1.696501703896158e-05, + "loss": 0.6571, + "step": 3572 + }, + { + "epoch": 0.28, + "grad_norm": 1.1602184651451246, + "learning_rate": 1.6963213839066263e-05, + "loss": 0.5956, + "step": 3573 + }, + { + "epoch": 0.28, + "grad_norm": 1.1053391774942984, + "learning_rate": 1.696141019954733e-05, + "loss": 0.5249, + "step": 3574 + }, + { + "epoch": 0.28, + "grad_norm": 1.093012588418014, + "learning_rate": 1.6959606120518656e-05, + "loss": 0.5847, + "step": 3575 + }, + { + "epoch": 0.28, + "grad_norm": 1.136895077587339, + "learning_rate": 1.695780160209414e-05, + "loss": 0.5928, + "step": 3576 + }, + { + "epoch": 0.28, + "grad_norm": 1.148027878974376, + "learning_rate": 1.6955996644387715e-05, + "loss": 0.6394, + "step": 3577 + }, + { + "epoch": 0.28, + "grad_norm": 1.2532928963150862, + "learning_rate": 1.695419124751333e-05, + "loss": 0.6068, + "step": 3578 + }, + { + "epoch": 0.28, + "grad_norm": 1.1299222710131411, + "learning_rate": 1.6952385411584974e-05, + "loss": 0.6031, + "step": 3579 + }, + { + "epoch": 0.28, + "grad_norm": 1.193586362238359, + "learning_rate": 1.695057913671666e-05, + "loss": 0.5865, + "step": 3580 + }, + { + "epoch": 0.28, + "grad_norm": 1.2034823890066315, + "learning_rate": 1.694877242302242e-05, + "loss": 0.5917, + "step": 3581 + }, + { + "epoch": 0.28, + "grad_norm": 1.2284215457475098, + "learning_rate": 1.694696527061633e-05, + "loss": 0.6301, + "step": 3582 + }, + { + "epoch": 0.28, + "grad_norm": 1.1804113630361248, + "learning_rate": 1.6945157679612478e-05, + "loss": 0.5869, + "step": 3583 + }, + { + "epoch": 0.28, + "grad_norm": 1.251924511467942, + "learning_rate": 1.694334965012499e-05, + "loss": 0.6105, + "step": 3584 + }, + { + "epoch": 0.28, + "grad_norm": 1.2661377139075032, + "learning_rate": 1.6941541182268015e-05, + "loss": 0.5938, + "step": 3585 + }, + { + "epoch": 0.28, + "grad_norm": 1.2739834697465422, + "learning_rate": 1.6939732276155733e-05, + "loss": 0.5946, + "step": 3586 + }, + { + "epoch": 0.28, + "grad_norm": 1.1314964278936366, + "learning_rate": 1.6937922931902348e-05, + "loss": 0.5699, + "step": 3587 + }, + { + "epoch": 0.28, + "grad_norm": 1.1397968578040283, + "learning_rate": 1.6936113149622093e-05, + "loss": 0.5665, + "step": 3588 + }, + { + "epoch": 0.28, + "grad_norm": 1.1232114986339181, + "learning_rate": 1.6934302929429226e-05, + "loss": 0.5357, + "step": 3589 + }, + { + "epoch": 0.28, + "grad_norm": 1.2606855478720311, + "learning_rate": 1.6932492271438046e-05, + "loss": 0.6386, + "step": 3590 + }, + { + "epoch": 0.28, + "grad_norm": 1.2709180094957568, + "learning_rate": 1.6930681175762855e-05, + "loss": 0.6314, + "step": 3591 + }, + { + "epoch": 0.28, + "grad_norm": 1.1488307876322585, + "learning_rate": 1.692886964251801e-05, + "loss": 0.5412, + "step": 3592 + }, + { + "epoch": 0.28, + "grad_norm": 1.1900822020436885, + "learning_rate": 1.6927057671817872e-05, + "loss": 0.5871, + "step": 3593 + }, + { + "epoch": 0.28, + "grad_norm": 1.1997646120790852, + "learning_rate": 1.6925245263776842e-05, + "loss": 0.6134, + "step": 3594 + }, + { + "epoch": 0.28, + "grad_norm": 1.15087450404533, + "learning_rate": 1.6923432418509356e-05, + "loss": 0.6305, + "step": 3595 + }, + { + "epoch": 0.28, + "grad_norm": 1.150878543716134, + "learning_rate": 1.6921619136129856e-05, + "loss": 0.6098, + "step": 3596 + }, + { + "epoch": 0.28, + "grad_norm": 1.2434921131474457, + "learning_rate": 1.691980541675283e-05, + "loss": 0.6404, + "step": 3597 + }, + { + "epoch": 0.28, + "grad_norm": 1.3097038456982897, + "learning_rate": 1.6917991260492787e-05, + "loss": 0.611, + "step": 3598 + }, + { + "epoch": 0.28, + "grad_norm": 1.229576829052618, + "learning_rate": 1.691617666746426e-05, + "loss": 0.5951, + "step": 3599 + }, + { + "epoch": 0.28, + "grad_norm": 1.2509677478679255, + "learning_rate": 1.691436163778182e-05, + "loss": 0.6254, + "step": 3600 + }, + { + "epoch": 0.28, + "grad_norm": 1.2207993126166072, + "learning_rate": 1.691254617156006e-05, + "loss": 0.6067, + "step": 3601 + }, + { + "epoch": 0.28, + "grad_norm": 1.2126412496190742, + "learning_rate": 1.6910730268913593e-05, + "loss": 0.6069, + "step": 3602 + }, + { + "epoch": 0.28, + "grad_norm": 1.1482466454767337, + "learning_rate": 1.690891392995707e-05, + "loss": 0.5591, + "step": 3603 + }, + { + "epoch": 0.28, + "grad_norm": 1.2779331072552105, + "learning_rate": 1.6907097154805162e-05, + "loss": 0.6097, + "step": 3604 + }, + { + "epoch": 0.28, + "grad_norm": 1.1215292054998223, + "learning_rate": 1.690527994357258e-05, + "loss": 0.5335, + "step": 3605 + }, + { + "epoch": 0.28, + "grad_norm": 1.2097671267358798, + "learning_rate": 1.6903462296374048e-05, + "loss": 0.5955, + "step": 3606 + }, + { + "epoch": 0.28, + "grad_norm": 1.265338205880517, + "learning_rate": 1.690164421332432e-05, + "loss": 0.6472, + "step": 3607 + }, + { + "epoch": 0.28, + "grad_norm": 1.1631976505570645, + "learning_rate": 1.689982569453819e-05, + "loss": 0.6159, + "step": 3608 + }, + { + "epoch": 0.28, + "grad_norm": 1.139486554701297, + "learning_rate": 1.689800674013046e-05, + "loss": 0.5496, + "step": 3609 + }, + { + "epoch": 0.28, + "grad_norm": 1.2219905808362046, + "learning_rate": 1.6896187350215977e-05, + "loss": 0.5765, + "step": 3610 + }, + { + "epoch": 0.28, + "grad_norm": 1.128260021636023, + "learning_rate": 1.689436752490961e-05, + "loss": 0.5796, + "step": 3611 + }, + { + "epoch": 0.28, + "grad_norm": 1.1476668282703644, + "learning_rate": 1.689254726432625e-05, + "loss": 0.5473, + "step": 3612 + }, + { + "epoch": 0.28, + "grad_norm": 1.2214486492947931, + "learning_rate": 1.689072656858082e-05, + "loss": 0.5615, + "step": 3613 + }, + { + "epoch": 0.28, + "grad_norm": 1.2743865145340358, + "learning_rate": 1.6888905437788268e-05, + "loss": 0.6311, + "step": 3614 + }, + { + "epoch": 0.28, + "grad_norm": 1.1860746060654277, + "learning_rate": 1.6887083872063574e-05, + "loss": 0.5759, + "step": 3615 + }, + { + "epoch": 0.28, + "grad_norm": 1.2199671120979143, + "learning_rate": 1.6885261871521746e-05, + "loss": 0.6144, + "step": 3616 + }, + { + "epoch": 0.28, + "grad_norm": 1.1535508380066237, + "learning_rate": 1.688343943627781e-05, + "loss": 0.5686, + "step": 3617 + }, + { + "epoch": 0.28, + "grad_norm": 1.1471007995347282, + "learning_rate": 1.6881616566446827e-05, + "loss": 0.5784, + "step": 3618 + }, + { + "epoch": 0.28, + "grad_norm": 1.2844511376156666, + "learning_rate": 1.6879793262143888e-05, + "loss": 0.6096, + "step": 3619 + }, + { + "epoch": 0.28, + "grad_norm": 1.2784397661016051, + "learning_rate": 1.6877969523484107e-05, + "loss": 0.6085, + "step": 3620 + }, + { + "epoch": 0.28, + "grad_norm": 1.2485378774076965, + "learning_rate": 1.6876145350582623e-05, + "loss": 0.6199, + "step": 3621 + }, + { + "epoch": 0.28, + "grad_norm": 1.4544783821951675, + "learning_rate": 1.6874320743554605e-05, + "loss": 0.6546, + "step": 3622 + }, + { + "epoch": 0.28, + "grad_norm": 1.262795192842514, + "learning_rate": 1.6872495702515253e-05, + "loss": 0.6071, + "step": 3623 + }, + { + "epoch": 0.28, + "grad_norm": 1.1761441098587606, + "learning_rate": 1.6870670227579788e-05, + "loss": 0.6048, + "step": 3624 + }, + { + "epoch": 0.28, + "grad_norm": 1.1147398912989954, + "learning_rate": 1.6868844318863466e-05, + "loss": 0.5831, + "step": 3625 + }, + { + "epoch": 0.28, + "grad_norm": 1.2985301199332893, + "learning_rate": 1.6867017976481563e-05, + "loss": 0.6552, + "step": 3626 + }, + { + "epoch": 0.28, + "grad_norm": 1.1591833897108108, + "learning_rate": 1.6865191200549387e-05, + "loss": 0.619, + "step": 3627 + }, + { + "epoch": 0.28, + "grad_norm": 1.3361089049911703, + "learning_rate": 1.686336399118227e-05, + "loss": 0.6389, + "step": 3628 + }, + { + "epoch": 0.28, + "grad_norm": 1.1874962856836957, + "learning_rate": 1.686153634849557e-05, + "loss": 0.5827, + "step": 3629 + }, + { + "epoch": 0.28, + "grad_norm": 1.1986359413308774, + "learning_rate": 1.6859708272604685e-05, + "loss": 0.5449, + "step": 3630 + }, + { + "epoch": 0.28, + "grad_norm": 1.1426136500386732, + "learning_rate": 1.6857879763625023e-05, + "loss": 0.5697, + "step": 3631 + }, + { + "epoch": 0.28, + "grad_norm": 1.2424993543412126, + "learning_rate": 1.6856050821672028e-05, + "loss": 0.6041, + "step": 3632 + }, + { + "epoch": 0.28, + "grad_norm": 1.279995187511934, + "learning_rate": 1.6854221446861175e-05, + "loss": 0.6209, + "step": 3633 + }, + { + "epoch": 0.28, + "grad_norm": 1.2880656694114747, + "learning_rate": 1.6852391639307956e-05, + "loss": 0.5896, + "step": 3634 + }, + { + "epoch": 0.28, + "grad_norm": 1.1408451860665263, + "learning_rate": 1.6850561399127902e-05, + "loss": 0.5795, + "step": 3635 + }, + { + "epoch": 0.28, + "grad_norm": 1.2170524512454841, + "learning_rate": 1.6848730726436562e-05, + "loss": 0.6073, + "step": 3636 + }, + { + "epoch": 0.28, + "grad_norm": 1.0355150860397122, + "learning_rate": 1.6846899621349516e-05, + "loss": 0.5646, + "step": 3637 + }, + { + "epoch": 0.28, + "grad_norm": 1.245254042341054, + "learning_rate": 1.6845068083982373e-05, + "loss": 0.6033, + "step": 3638 + }, + { + "epoch": 0.28, + "grad_norm": 1.0928416158426182, + "learning_rate": 1.684323611445076e-05, + "loss": 0.5769, + "step": 3639 + }, + { + "epoch": 0.28, + "grad_norm": 1.1603488954835428, + "learning_rate": 1.684140371287035e-05, + "loss": 0.5192, + "step": 3640 + }, + { + "epoch": 0.28, + "grad_norm": 1.26295590053536, + "learning_rate": 1.6839570879356827e-05, + "loss": 0.6128, + "step": 3641 + }, + { + "epoch": 0.28, + "grad_norm": 1.0958204019651383, + "learning_rate": 1.6837737614025904e-05, + "loss": 0.5739, + "step": 3642 + }, + { + "epoch": 0.28, + "grad_norm": 1.138981929069691, + "learning_rate": 1.683590391699333e-05, + "loss": 0.586, + "step": 3643 + }, + { + "epoch": 0.28, + "grad_norm": 1.2919229386528515, + "learning_rate": 1.683406978837487e-05, + "loss": 0.5916, + "step": 3644 + }, + { + "epoch": 0.28, + "grad_norm": 1.2962799085736711, + "learning_rate": 1.683223522828633e-05, + "loss": 0.6505, + "step": 3645 + }, + { + "epoch": 0.28, + "grad_norm": 1.0693247337276943, + "learning_rate": 1.6830400236843525e-05, + "loss": 0.5842, + "step": 3646 + }, + { + "epoch": 0.28, + "grad_norm": 1.2289810148220723, + "learning_rate": 1.6828564814162318e-05, + "loss": 0.5976, + "step": 3647 + }, + { + "epoch": 0.28, + "grad_norm": 1.1618429439528235, + "learning_rate": 1.682672896035858e-05, + "loss": 0.5479, + "step": 3648 + }, + { + "epoch": 0.28, + "grad_norm": 1.1851398705775793, + "learning_rate": 1.682489267554822e-05, + "loss": 0.6044, + "step": 3649 + }, + { + "epoch": 0.28, + "grad_norm": 1.190137994843335, + "learning_rate": 1.6823055959847177e-05, + "loss": 0.5815, + "step": 3650 + }, + { + "epoch": 0.28, + "grad_norm": 1.242280968562384, + "learning_rate": 1.6821218813371407e-05, + "loss": 0.5952, + "step": 3651 + }, + { + "epoch": 0.28, + "grad_norm": 1.2239719417167219, + "learning_rate": 1.68193812362369e-05, + "loss": 0.6235, + "step": 3652 + }, + { + "epoch": 0.28, + "grad_norm": 1.249072780037141, + "learning_rate": 1.6817543228559675e-05, + "loss": 0.6111, + "step": 3653 + }, + { + "epoch": 0.28, + "grad_norm": 1.255633344228541, + "learning_rate": 1.6815704790455768e-05, + "loss": 0.5582, + "step": 3654 + }, + { + "epoch": 0.28, + "grad_norm": 1.1690479510762544, + "learning_rate": 1.6813865922041258e-05, + "loss": 0.6173, + "step": 3655 + }, + { + "epoch": 0.28, + "grad_norm": 1.1983124231274709, + "learning_rate": 1.6812026623432233e-05, + "loss": 0.5681, + "step": 3656 + }, + { + "epoch": 0.28, + "grad_norm": 1.1530439932470744, + "learning_rate": 1.6810186894744825e-05, + "loss": 0.5886, + "step": 3657 + }, + { + "epoch": 0.28, + "grad_norm": 1.1777359438960062, + "learning_rate": 1.680834673609518e-05, + "loss": 0.6041, + "step": 3658 + }, + { + "epoch": 0.28, + "grad_norm": 1.2249689137154018, + "learning_rate": 1.680650614759948e-05, + "loss": 0.6975, + "step": 3659 + }, + { + "epoch": 0.28, + "grad_norm": 1.2726232815963103, + "learning_rate": 1.6804665129373928e-05, + "loss": 0.583, + "step": 3660 + }, + { + "epoch": 0.28, + "grad_norm": 1.1454927747541033, + "learning_rate": 1.6802823681534765e-05, + "loss": 0.5201, + "step": 3661 + }, + { + "epoch": 0.28, + "grad_norm": 1.1847175324260097, + "learning_rate": 1.680098180419824e-05, + "loss": 0.587, + "step": 3662 + }, + { + "epoch": 0.28, + "grad_norm": 1.212679489850876, + "learning_rate": 1.6799139497480644e-05, + "loss": 0.6453, + "step": 3663 + }, + { + "epoch": 0.28, + "grad_norm": 1.1576328256579598, + "learning_rate": 1.6797296761498295e-05, + "loss": 0.5263, + "step": 3664 + }, + { + "epoch": 0.28, + "grad_norm": 1.2011842804555024, + "learning_rate": 1.6795453596367533e-05, + "loss": 0.6175, + "step": 3665 + }, + { + "epoch": 0.28, + "grad_norm": 1.2626533471824568, + "learning_rate": 1.6793610002204724e-05, + "loss": 0.6154, + "step": 3666 + }, + { + "epoch": 0.28, + "grad_norm": 1.2283046037807088, + "learning_rate": 1.6791765979126267e-05, + "loss": 0.6373, + "step": 3667 + }, + { + "epoch": 0.28, + "grad_norm": 1.1343824885517877, + "learning_rate": 1.6789921527248578e-05, + "loss": 0.5828, + "step": 3668 + }, + { + "epoch": 0.28, + "grad_norm": 1.1749861046801389, + "learning_rate": 1.6788076646688117e-05, + "loss": 0.6208, + "step": 3669 + }, + { + "epoch": 0.28, + "grad_norm": 1.1817832167161415, + "learning_rate": 1.6786231337561352e-05, + "loss": 0.5823, + "step": 3670 + }, + { + "epoch": 0.28, + "grad_norm": 1.2651603222452013, + "learning_rate": 1.6784385599984794e-05, + "loss": 0.6719, + "step": 3671 + }, + { + "epoch": 0.28, + "grad_norm": 1.241392012863191, + "learning_rate": 1.678253943407497e-05, + "loss": 0.566, + "step": 3672 + }, + { + "epoch": 0.28, + "grad_norm": 1.2266400063682432, + "learning_rate": 1.6780692839948433e-05, + "loss": 0.6127, + "step": 3673 + }, + { + "epoch": 0.29, + "grad_norm": 1.2001393912737548, + "learning_rate": 1.6778845817721778e-05, + "loss": 0.5785, + "step": 3674 + }, + { + "epoch": 0.29, + "grad_norm": 1.2902381391063253, + "learning_rate": 1.677699836751161e-05, + "loss": 0.6447, + "step": 3675 + }, + { + "epoch": 0.29, + "grad_norm": 1.23759219856898, + "learning_rate": 1.677515048943457e-05, + "loss": 0.5944, + "step": 3676 + }, + { + "epoch": 0.29, + "grad_norm": 1.2434291751695066, + "learning_rate": 1.6773302183607327e-05, + "loss": 0.6044, + "step": 3677 + }, + { + "epoch": 0.29, + "grad_norm": 1.1192022443121712, + "learning_rate": 1.6771453450146568e-05, + "loss": 0.58, + "step": 3678 + }, + { + "epoch": 0.29, + "grad_norm": 1.1291153657996924, + "learning_rate": 1.676960428916902e-05, + "loss": 0.5502, + "step": 3679 + }, + { + "epoch": 0.29, + "grad_norm": 1.1483491617079862, + "learning_rate": 1.6767754700791425e-05, + "loss": 0.5985, + "step": 3680 + }, + { + "epoch": 0.29, + "grad_norm": 1.3670887938699179, + "learning_rate": 1.676590468513056e-05, + "loss": 0.7028, + "step": 3681 + }, + { + "epoch": 0.29, + "grad_norm": 1.197612270436084, + "learning_rate": 1.6764054242303223e-05, + "loss": 0.5851, + "step": 3682 + }, + { + "epoch": 0.29, + "grad_norm": 1.1398866432282182, + "learning_rate": 1.6762203372426243e-05, + "loss": 0.5857, + "step": 3683 + }, + { + "epoch": 0.29, + "grad_norm": 1.248080496417242, + "learning_rate": 1.6760352075616476e-05, + "loss": 0.5475, + "step": 3684 + }, + { + "epoch": 0.29, + "grad_norm": 1.2040667625528025, + "learning_rate": 1.6758500351990808e-05, + "loss": 0.6096, + "step": 3685 + }, + { + "epoch": 0.29, + "grad_norm": 1.2591083083313643, + "learning_rate": 1.675664820166614e-05, + "loss": 0.6257, + "step": 3686 + }, + { + "epoch": 0.29, + "grad_norm": 1.265467410225235, + "learning_rate": 1.6754795624759414e-05, + "loss": 0.5845, + "step": 3687 + }, + { + "epoch": 0.29, + "grad_norm": 1.1726670195449058, + "learning_rate": 1.6752942621387583e-05, + "loss": 0.605, + "step": 3688 + }, + { + "epoch": 0.29, + "grad_norm": 1.303488357345625, + "learning_rate": 1.6751089191667648e-05, + "loss": 0.5594, + "step": 3689 + }, + { + "epoch": 0.29, + "grad_norm": 1.0873268625400927, + "learning_rate": 1.6749235335716624e-05, + "loss": 0.6074, + "step": 3690 + }, + { + "epoch": 0.29, + "grad_norm": 1.2006165748312496, + "learning_rate": 1.674738105365155e-05, + "loss": 0.5984, + "step": 3691 + }, + { + "epoch": 0.29, + "grad_norm": 1.1164834531727266, + "learning_rate": 1.67455263455895e-05, + "loss": 0.593, + "step": 3692 + }, + { + "epoch": 0.29, + "grad_norm": 1.1351784564141443, + "learning_rate": 1.6743671211647564e-05, + "loss": 0.5658, + "step": 3693 + }, + { + "epoch": 0.29, + "grad_norm": 1.1342483891220712, + "learning_rate": 1.6741815651942873e-05, + "loss": 0.5802, + "step": 3694 + }, + { + "epoch": 0.29, + "grad_norm": 1.1397256833189546, + "learning_rate": 1.673995966659258e-05, + "loss": 0.5906, + "step": 3695 + }, + { + "epoch": 0.29, + "grad_norm": 1.3077400725258972, + "learning_rate": 1.673810325571386e-05, + "loss": 0.6365, + "step": 3696 + }, + { + "epoch": 0.29, + "grad_norm": 1.2617434094738549, + "learning_rate": 1.6736246419423915e-05, + "loss": 0.661, + "step": 3697 + }, + { + "epoch": 0.29, + "grad_norm": 1.0985808364441236, + "learning_rate": 1.6734389157839975e-05, + "loss": 0.5632, + "step": 3698 + }, + { + "epoch": 0.29, + "grad_norm": 1.2235864385416373, + "learning_rate": 1.673253147107931e-05, + "loss": 0.6572, + "step": 3699 + }, + { + "epoch": 0.29, + "grad_norm": 1.2304493251659951, + "learning_rate": 1.6730673359259194e-05, + "loss": 0.618, + "step": 3700 + }, + { + "epoch": 0.29, + "grad_norm": 1.1962495010996768, + "learning_rate": 1.6728814822496944e-05, + "loss": 0.5747, + "step": 3701 + }, + { + "epoch": 0.29, + "grad_norm": 1.2330223108022649, + "learning_rate": 1.6726955860909903e-05, + "loss": 0.5644, + "step": 3702 + }, + { + "epoch": 0.29, + "grad_norm": 1.186116717923621, + "learning_rate": 1.6725096474615423e-05, + "loss": 0.5529, + "step": 3703 + }, + { + "epoch": 0.29, + "grad_norm": 1.1317288707684572, + "learning_rate": 1.6723236663730912e-05, + "loss": 0.5652, + "step": 3704 + }, + { + "epoch": 0.29, + "grad_norm": 1.1843244858892736, + "learning_rate": 1.672137642837378e-05, + "loss": 0.6114, + "step": 3705 + }, + { + "epoch": 0.29, + "grad_norm": 1.2262129558913495, + "learning_rate": 1.6719515768661477e-05, + "loss": 0.6509, + "step": 3706 + }, + { + "epoch": 0.29, + "grad_norm": 1.2056599948010467, + "learning_rate": 1.6717654684711475e-05, + "loss": 0.6074, + "step": 3707 + }, + { + "epoch": 0.29, + "grad_norm": 1.1820732396667126, + "learning_rate": 1.6715793176641275e-05, + "loss": 0.6061, + "step": 3708 + }, + { + "epoch": 0.29, + "grad_norm": 1.096374034324032, + "learning_rate": 1.67139312445684e-05, + "loss": 0.5708, + "step": 3709 + }, + { + "epoch": 0.29, + "grad_norm": 1.0450719422577202, + "learning_rate": 1.671206888861041e-05, + "loss": 0.5936, + "step": 3710 + }, + { + "epoch": 0.29, + "grad_norm": 1.2613089164530626, + "learning_rate": 1.6710206108884884e-05, + "loss": 0.5955, + "step": 3711 + }, + { + "epoch": 0.29, + "grad_norm": 1.2456495874622497, + "learning_rate": 1.6708342905509424e-05, + "loss": 0.6818, + "step": 3712 + }, + { + "epoch": 0.29, + "grad_norm": 1.2108561826905424, + "learning_rate": 1.670647927860166e-05, + "loss": 0.5932, + "step": 3713 + }, + { + "epoch": 0.29, + "grad_norm": 1.344744558220863, + "learning_rate": 1.6704615228279262e-05, + "loss": 0.6318, + "step": 3714 + }, + { + "epoch": 0.29, + "grad_norm": 1.22814940799599, + "learning_rate": 1.6702750754659917e-05, + "loss": 0.6323, + "step": 3715 + }, + { + "epoch": 0.29, + "grad_norm": 1.4351414114362013, + "learning_rate": 1.6700885857861332e-05, + "loss": 0.6453, + "step": 3716 + }, + { + "epoch": 0.29, + "grad_norm": 1.204848899864932, + "learning_rate": 1.6699020538001252e-05, + "loss": 0.533, + "step": 3717 + }, + { + "epoch": 0.29, + "grad_norm": 1.3624670444667628, + "learning_rate": 1.6697154795197442e-05, + "loss": 0.6595, + "step": 3718 + }, + { + "epoch": 0.29, + "grad_norm": 1.2947579534698466, + "learning_rate": 1.6695288629567694e-05, + "loss": 0.5823, + "step": 3719 + }, + { + "epoch": 0.29, + "grad_norm": 1.1919116839808113, + "learning_rate": 1.6693422041229836e-05, + "loss": 0.6098, + "step": 3720 + }, + { + "epoch": 0.29, + "grad_norm": 1.2553045729257943, + "learning_rate": 1.669155503030171e-05, + "loss": 0.5954, + "step": 3721 + }, + { + "epoch": 0.29, + "grad_norm": 1.177709576043585, + "learning_rate": 1.6689687596901192e-05, + "loss": 0.5744, + "step": 3722 + }, + { + "epoch": 0.29, + "grad_norm": 1.2484855537571784, + "learning_rate": 1.668781974114618e-05, + "loss": 0.6338, + "step": 3723 + }, + { + "epoch": 0.29, + "grad_norm": 1.1868125029667411, + "learning_rate": 1.6685951463154602e-05, + "loss": 0.5801, + "step": 3724 + }, + { + "epoch": 0.29, + "grad_norm": 1.2722151336629768, + "learning_rate": 1.6684082763044415e-05, + "loss": 0.6033, + "step": 3725 + }, + { + "epoch": 0.29, + "grad_norm": 1.2374335434919408, + "learning_rate": 1.66822136409336e-05, + "loss": 0.6073, + "step": 3726 + }, + { + "epoch": 0.29, + "grad_norm": 1.354431855082625, + "learning_rate": 1.6680344096940157e-05, + "loss": 0.6317, + "step": 3727 + }, + { + "epoch": 0.29, + "grad_norm": 1.164652476478852, + "learning_rate": 1.667847413118213e-05, + "loss": 0.5498, + "step": 3728 + }, + { + "epoch": 0.29, + "grad_norm": 1.0840897792398052, + "learning_rate": 1.667660374377757e-05, + "loss": 0.5425, + "step": 3729 + }, + { + "epoch": 0.29, + "grad_norm": 1.2490513539717667, + "learning_rate": 1.6674732934844574e-05, + "loss": 0.6148, + "step": 3730 + }, + { + "epoch": 0.29, + "grad_norm": 1.1622798506131633, + "learning_rate": 1.6672861704501247e-05, + "loss": 0.5757, + "step": 3731 + }, + { + "epoch": 0.29, + "grad_norm": 1.2165724055908238, + "learning_rate": 1.6670990052865738e-05, + "loss": 0.5865, + "step": 3732 + }, + { + "epoch": 0.29, + "grad_norm": 1.2117058070147622, + "learning_rate": 1.66691179800562e-05, + "loss": 0.5735, + "step": 3733 + }, + { + "epoch": 0.29, + "grad_norm": 1.1343691423753561, + "learning_rate": 1.6667245486190845e-05, + "loss": 0.574, + "step": 3734 + }, + { + "epoch": 0.29, + "grad_norm": 1.2180225940325038, + "learning_rate": 1.6665372571387882e-05, + "loss": 0.5949, + "step": 3735 + }, + { + "epoch": 0.29, + "grad_norm": 1.3253238084378547, + "learning_rate": 1.6663499235765557e-05, + "loss": 0.6714, + "step": 3736 + }, + { + "epoch": 0.29, + "grad_norm": 1.2749582583008534, + "learning_rate": 1.6661625479442147e-05, + "loss": 0.5866, + "step": 3737 + }, + { + "epoch": 0.29, + "grad_norm": 1.2200862210714658, + "learning_rate": 1.6659751302535952e-05, + "loss": 0.6255, + "step": 3738 + }, + { + "epoch": 0.29, + "grad_norm": 1.097326045751031, + "learning_rate": 1.6657876705165296e-05, + "loss": 0.5646, + "step": 3739 + }, + { + "epoch": 0.29, + "grad_norm": 1.1692795558454008, + "learning_rate": 1.6656001687448532e-05, + "loss": 0.5607, + "step": 3740 + }, + { + "epoch": 0.29, + "grad_norm": 1.1575621299465007, + "learning_rate": 1.6654126249504042e-05, + "loss": 0.6271, + "step": 3741 + }, + { + "epoch": 0.29, + "grad_norm": 1.203734998758077, + "learning_rate": 1.665225039145024e-05, + "loss": 0.6097, + "step": 3742 + }, + { + "epoch": 0.29, + "grad_norm": 1.2704827601596627, + "learning_rate": 1.6650374113405536e-05, + "loss": 0.6201, + "step": 3743 + }, + { + "epoch": 0.29, + "grad_norm": 1.2017453693184021, + "learning_rate": 1.664849741548841e-05, + "loss": 0.5835, + "step": 3744 + }, + { + "epoch": 0.29, + "grad_norm": 1.176355266178128, + "learning_rate": 1.664662029781734e-05, + "loss": 0.5748, + "step": 3745 + }, + { + "epoch": 0.29, + "grad_norm": 1.287058573112449, + "learning_rate": 1.6644742760510837e-05, + "loss": 0.6824, + "step": 3746 + }, + { + "epoch": 0.29, + "grad_norm": 1.13325965211748, + "learning_rate": 1.6642864803687443e-05, + "loss": 0.5917, + "step": 3747 + }, + { + "epoch": 0.29, + "grad_norm": 1.1196763385852007, + "learning_rate": 1.664098642746572e-05, + "loss": 0.5421, + "step": 3748 + }, + { + "epoch": 0.29, + "grad_norm": 1.1415380194268863, + "learning_rate": 1.663910763196426e-05, + "loss": 0.6158, + "step": 3749 + }, + { + "epoch": 0.29, + "grad_norm": 1.157947994648923, + "learning_rate": 1.663722841730168e-05, + "loss": 0.5582, + "step": 3750 + }, + { + "epoch": 0.29, + "grad_norm": 1.1579000710177414, + "learning_rate": 1.663534878359663e-05, + "loss": 0.6173, + "step": 3751 + }, + { + "epoch": 0.29, + "grad_norm": 1.2256714323703495, + "learning_rate": 1.6633468730967778e-05, + "loss": 0.625, + "step": 3752 + }, + { + "epoch": 0.29, + "grad_norm": 1.2527372906766623, + "learning_rate": 1.663158825953382e-05, + "loss": 0.6353, + "step": 3753 + }, + { + "epoch": 0.29, + "grad_norm": 1.3130623203226444, + "learning_rate": 1.662970736941348e-05, + "loss": 0.5916, + "step": 3754 + }, + { + "epoch": 0.29, + "grad_norm": 1.3642249134113675, + "learning_rate": 1.662782606072551e-05, + "loss": 0.6803, + "step": 3755 + }, + { + "epoch": 0.29, + "grad_norm": 1.3849915399534007, + "learning_rate": 1.6625944333588686e-05, + "loss": 0.5918, + "step": 3756 + }, + { + "epoch": 0.29, + "grad_norm": 1.1508884356706663, + "learning_rate": 1.6624062188121808e-05, + "loss": 0.5591, + "step": 3757 + }, + { + "epoch": 0.29, + "grad_norm": 1.1644823482087894, + "learning_rate": 1.662217962444371e-05, + "loss": 0.5957, + "step": 3758 + }, + { + "epoch": 0.29, + "grad_norm": 1.2755559606713933, + "learning_rate": 1.6620296642673248e-05, + "loss": 0.6065, + "step": 3759 + }, + { + "epoch": 0.29, + "grad_norm": 1.2164102739232026, + "learning_rate": 1.6618413242929302e-05, + "loss": 0.575, + "step": 3760 + }, + { + "epoch": 0.29, + "grad_norm": 1.1921120975992539, + "learning_rate": 1.661652942533078e-05, + "loss": 0.5584, + "step": 3761 + }, + { + "epoch": 0.29, + "grad_norm": 1.0089075807300032, + "learning_rate": 1.661464518999662e-05, + "loss": 0.5146, + "step": 3762 + }, + { + "epoch": 0.29, + "grad_norm": 1.2827660964583378, + "learning_rate": 1.6612760537045782e-05, + "loss": 0.5962, + "step": 3763 + }, + { + "epoch": 0.29, + "grad_norm": 1.0936471073254517, + "learning_rate": 1.6610875466597252e-05, + "loss": 0.5608, + "step": 3764 + }, + { + "epoch": 0.29, + "grad_norm": 1.3741402539100864, + "learning_rate": 1.660898997877005e-05, + "loss": 0.6879, + "step": 3765 + }, + { + "epoch": 0.29, + "grad_norm": 1.0385118904296284, + "learning_rate": 1.660710407368321e-05, + "loss": 0.5169, + "step": 3766 + }, + { + "epoch": 0.29, + "grad_norm": 1.1351121908355077, + "learning_rate": 1.66052177514558e-05, + "loss": 0.5849, + "step": 3767 + }, + { + "epoch": 0.29, + "grad_norm": 1.081335291502842, + "learning_rate": 1.660333101220692e-05, + "loss": 0.5491, + "step": 3768 + }, + { + "epoch": 0.29, + "grad_norm": 1.1485172685181053, + "learning_rate": 1.660144385605568e-05, + "loss": 0.5717, + "step": 3769 + }, + { + "epoch": 0.29, + "grad_norm": 1.2980034296581175, + "learning_rate": 1.659955628312123e-05, + "loss": 0.6213, + "step": 3770 + }, + { + "epoch": 0.29, + "grad_norm": 1.1887873648337839, + "learning_rate": 1.6597668293522745e-05, + "loss": 0.6154, + "step": 3771 + }, + { + "epoch": 0.29, + "grad_norm": 1.1951785043646104, + "learning_rate": 1.659577988737942e-05, + "loss": 0.6031, + "step": 3772 + }, + { + "epoch": 0.29, + "grad_norm": 1.1354199622336807, + "learning_rate": 1.659389106481048e-05, + "loss": 0.5819, + "step": 3773 + }, + { + "epoch": 0.29, + "grad_norm": 1.2259338128362396, + "learning_rate": 1.659200182593518e-05, + "loss": 0.5552, + "step": 3774 + }, + { + "epoch": 0.29, + "grad_norm": 1.1859512267995729, + "learning_rate": 1.6590112170872792e-05, + "loss": 0.5845, + "step": 3775 + }, + { + "epoch": 0.29, + "grad_norm": 1.2402792134026883, + "learning_rate": 1.6588222099742624e-05, + "loss": 0.5888, + "step": 3776 + }, + { + "epoch": 0.29, + "grad_norm": 1.1730735244797954, + "learning_rate": 1.6586331612664005e-05, + "loss": 0.5874, + "step": 3777 + }, + { + "epoch": 0.29, + "grad_norm": 1.2310936688160352, + "learning_rate": 1.658444070975629e-05, + "loss": 0.6599, + "step": 3778 + }, + { + "epoch": 0.29, + "grad_norm": 1.2799764677209153, + "learning_rate": 1.658254939113886e-05, + "loss": 0.6245, + "step": 3779 + }, + { + "epoch": 0.29, + "grad_norm": 1.0665915192021374, + "learning_rate": 1.658065765693112e-05, + "loss": 0.5781, + "step": 3780 + }, + { + "epoch": 0.29, + "grad_norm": 1.1837711516024327, + "learning_rate": 1.657876550725252e-05, + "loss": 0.6121, + "step": 3781 + }, + { + "epoch": 0.29, + "grad_norm": 1.2186858820555029, + "learning_rate": 1.6576872942222504e-05, + "loss": 0.6413, + "step": 3782 + }, + { + "epoch": 0.29, + "grad_norm": 1.2648061293154886, + "learning_rate": 1.6574979961960572e-05, + "loss": 0.5985, + "step": 3783 + }, + { + "epoch": 0.29, + "grad_norm": 1.221544143878446, + "learning_rate": 1.657308656658623e-05, + "loss": 0.6057, + "step": 3784 + }, + { + "epoch": 0.29, + "grad_norm": 1.122637758786287, + "learning_rate": 1.6571192756219024e-05, + "loss": 0.567, + "step": 3785 + }, + { + "epoch": 0.29, + "grad_norm": 1.2191033462452205, + "learning_rate": 1.6569298530978516e-05, + "loss": 0.6446, + "step": 3786 + }, + { + "epoch": 0.29, + "grad_norm": 1.2007567642088104, + "learning_rate": 1.6567403890984292e-05, + "loss": 0.6187, + "step": 3787 + }, + { + "epoch": 0.29, + "grad_norm": 1.1891290132263785, + "learning_rate": 1.6565508836355983e-05, + "loss": 0.5518, + "step": 3788 + }, + { + "epoch": 0.29, + "grad_norm": 1.25788282114907, + "learning_rate": 1.6563613367213225e-05, + "loss": 0.5823, + "step": 3789 + }, + { + "epoch": 0.29, + "grad_norm": 1.2776215044232617, + "learning_rate": 1.6561717483675695e-05, + "loss": 0.6181, + "step": 3790 + }, + { + "epoch": 0.29, + "grad_norm": 1.2788741716522585, + "learning_rate": 1.6559821185863082e-05, + "loss": 0.5985, + "step": 3791 + }, + { + "epoch": 0.29, + "grad_norm": 1.2411446662456593, + "learning_rate": 1.6557924473895115e-05, + "loss": 0.6061, + "step": 3792 + }, + { + "epoch": 0.29, + "grad_norm": 1.2374640815993796, + "learning_rate": 1.6556027347891542e-05, + "loss": 0.5734, + "step": 3793 + }, + { + "epoch": 0.29, + "grad_norm": 1.346408963394928, + "learning_rate": 1.6554129807972135e-05, + "loss": 0.6389, + "step": 3794 + }, + { + "epoch": 0.29, + "grad_norm": 1.1061710243267628, + "learning_rate": 1.6552231854256704e-05, + "loss": 0.5716, + "step": 3795 + }, + { + "epoch": 0.29, + "grad_norm": 1.2882410379753582, + "learning_rate": 1.6550333486865068e-05, + "loss": 0.6523, + "step": 3796 + }, + { + "epoch": 0.29, + "grad_norm": 1.2567514244422102, + "learning_rate": 1.654843470591708e-05, + "loss": 0.6178, + "step": 3797 + }, + { + "epoch": 0.29, + "grad_norm": 1.1786103035761748, + "learning_rate": 1.654653551153263e-05, + "loss": 0.5534, + "step": 3798 + }, + { + "epoch": 0.29, + "grad_norm": 1.1545824965867795, + "learning_rate": 1.6544635903831616e-05, + "loss": 0.6195, + "step": 3799 + }, + { + "epoch": 0.29, + "grad_norm": 1.1544990685196097, + "learning_rate": 1.6542735882933967e-05, + "loss": 0.612, + "step": 3800 + }, + { + "epoch": 0.29, + "grad_norm": 1.218692876015038, + "learning_rate": 1.6540835448959648e-05, + "loss": 0.559, + "step": 3801 + }, + { + "epoch": 0.29, + "grad_norm": 1.2635524404411844, + "learning_rate": 1.653893460202864e-05, + "loss": 0.6071, + "step": 3802 + }, + { + "epoch": 0.3, + "grad_norm": 1.2577346398559268, + "learning_rate": 1.6537033342260957e-05, + "loss": 0.6566, + "step": 3803 + }, + { + "epoch": 0.3, + "grad_norm": 1.1768399664981797, + "learning_rate": 1.653513166977663e-05, + "loss": 0.596, + "step": 3804 + }, + { + "epoch": 0.3, + "grad_norm": 1.1698678693581561, + "learning_rate": 1.6533229584695726e-05, + "loss": 0.5922, + "step": 3805 + }, + { + "epoch": 0.3, + "grad_norm": 1.2614141515821948, + "learning_rate": 1.653132708713833e-05, + "loss": 0.6109, + "step": 3806 + }, + { + "epoch": 0.3, + "grad_norm": 1.3486700253114035, + "learning_rate": 1.6529424177224558e-05, + "loss": 0.6106, + "step": 3807 + }, + { + "epoch": 0.3, + "grad_norm": 1.3491381472580792, + "learning_rate": 1.652752085507455e-05, + "loss": 0.6157, + "step": 3808 + }, + { + "epoch": 0.3, + "grad_norm": 1.1173841930181507, + "learning_rate": 1.6525617120808474e-05, + "loss": 0.5551, + "step": 3809 + }, + { + "epoch": 0.3, + "grad_norm": 1.181219205505385, + "learning_rate": 1.6523712974546522e-05, + "loss": 0.6224, + "step": 3810 + }, + { + "epoch": 0.3, + "grad_norm": 1.0870230209699292, + "learning_rate": 1.652180841640891e-05, + "loss": 0.5827, + "step": 3811 + }, + { + "epoch": 0.3, + "grad_norm": 1.1779341646848465, + "learning_rate": 1.6519903446515884e-05, + "loss": 0.6034, + "step": 3812 + }, + { + "epoch": 0.3, + "grad_norm": 1.2670920076149634, + "learning_rate": 1.6517998064987713e-05, + "loss": 0.5926, + "step": 3813 + }, + { + "epoch": 0.3, + "grad_norm": 1.1594448284497245, + "learning_rate": 1.6516092271944702e-05, + "loss": 0.5928, + "step": 3814 + }, + { + "epoch": 0.3, + "grad_norm": 1.2274106731659973, + "learning_rate": 1.6514186067507164e-05, + "loss": 0.6147, + "step": 3815 + }, + { + "epoch": 0.3, + "grad_norm": 1.307674164559868, + "learning_rate": 1.651227945179545e-05, + "loss": 0.6301, + "step": 3816 + }, + { + "epoch": 0.3, + "grad_norm": 1.0382097803781347, + "learning_rate": 1.6510372424929938e-05, + "loss": 0.5734, + "step": 3817 + }, + { + "epoch": 0.3, + "grad_norm": 1.2147871292510144, + "learning_rate": 1.650846498703102e-05, + "loss": 0.6246, + "step": 3818 + }, + { + "epoch": 0.3, + "grad_norm": 1.1487076110822814, + "learning_rate": 1.6506557138219136e-05, + "loss": 0.5224, + "step": 3819 + }, + { + "epoch": 0.3, + "grad_norm": 1.3060120123943155, + "learning_rate": 1.6504648878614726e-05, + "loss": 0.5835, + "step": 3820 + }, + { + "epoch": 0.3, + "grad_norm": 1.0709540678801464, + "learning_rate": 1.6502740208338273e-05, + "loss": 0.5895, + "step": 3821 + }, + { + "epoch": 0.3, + "grad_norm": 1.2362763934890642, + "learning_rate": 1.650083112751028e-05, + "loss": 0.6449, + "step": 3822 + }, + { + "epoch": 0.3, + "grad_norm": 1.1390627783839091, + "learning_rate": 1.6498921636251278e-05, + "loss": 0.5832, + "step": 3823 + }, + { + "epoch": 0.3, + "grad_norm": 1.10270155865618, + "learning_rate": 1.6497011734681824e-05, + "loss": 0.5772, + "step": 3824 + }, + { + "epoch": 0.3, + "grad_norm": 1.3061348202679655, + "learning_rate": 1.64951014229225e-05, + "loss": 0.6041, + "step": 3825 + }, + { + "epoch": 0.3, + "grad_norm": 1.150218024525958, + "learning_rate": 1.6493190701093913e-05, + "loss": 0.5507, + "step": 3826 + }, + { + "epoch": 0.3, + "grad_norm": 1.1654001525338058, + "learning_rate": 1.6491279569316697e-05, + "loss": 0.5608, + "step": 3827 + }, + { + "epoch": 0.3, + "grad_norm": 1.2742798248566691, + "learning_rate": 1.6489368027711507e-05, + "loss": 0.5654, + "step": 3828 + }, + { + "epoch": 0.3, + "grad_norm": 1.2060025964522365, + "learning_rate": 1.648745607639904e-05, + "loss": 0.6007, + "step": 3829 + }, + { + "epoch": 0.3, + "grad_norm": 1.2790790867516384, + "learning_rate": 1.6485543715499994e-05, + "loss": 0.6137, + "step": 3830 + }, + { + "epoch": 0.3, + "grad_norm": 1.211047505950281, + "learning_rate": 1.6483630945135112e-05, + "loss": 0.632, + "step": 3831 + }, + { + "epoch": 0.3, + "grad_norm": 1.1557007077692802, + "learning_rate": 1.648171776542516e-05, + "loss": 0.5673, + "step": 3832 + }, + { + "epoch": 0.3, + "grad_norm": 1.1948993442552436, + "learning_rate": 1.647980417649092e-05, + "loss": 0.6027, + "step": 3833 + }, + { + "epoch": 0.3, + "grad_norm": 1.250139705480305, + "learning_rate": 1.6477890178453216e-05, + "loss": 0.5999, + "step": 3834 + }, + { + "epoch": 0.3, + "grad_norm": 1.0587319146675518, + "learning_rate": 1.6475975771432883e-05, + "loss": 0.5594, + "step": 3835 + }, + { + "epoch": 0.3, + "grad_norm": 1.1433898129336177, + "learning_rate": 1.6474060955550783e-05, + "loss": 0.5648, + "step": 3836 + }, + { + "epoch": 0.3, + "grad_norm": 1.1182726411699961, + "learning_rate": 1.647214573092782e-05, + "loss": 0.5938, + "step": 3837 + }, + { + "epoch": 0.3, + "grad_norm": 1.2268435891124432, + "learning_rate": 1.64702300976849e-05, + "loss": 0.656, + "step": 3838 + }, + { + "epoch": 0.3, + "grad_norm": 1.1606531074666053, + "learning_rate": 1.6468314055942977e-05, + "loss": 0.5799, + "step": 3839 + }, + { + "epoch": 0.3, + "grad_norm": 1.1225013535795143, + "learning_rate": 1.646639760582301e-05, + "loss": 0.6052, + "step": 3840 + }, + { + "epoch": 0.3, + "grad_norm": 1.3084316324048235, + "learning_rate": 1.6464480747446e-05, + "loss": 0.6426, + "step": 3841 + }, + { + "epoch": 0.3, + "grad_norm": 1.2054317209090708, + "learning_rate": 1.646256348093297e-05, + "loss": 0.5734, + "step": 3842 + }, + { + "epoch": 0.3, + "grad_norm": 1.2244733768371143, + "learning_rate": 1.6460645806404967e-05, + "loss": 0.5561, + "step": 3843 + }, + { + "epoch": 0.3, + "grad_norm": 1.147653221090019, + "learning_rate": 1.645872772398306e-05, + "loss": 0.6205, + "step": 3844 + }, + { + "epoch": 0.3, + "grad_norm": 1.1980938932373515, + "learning_rate": 1.645680923378835e-05, + "loss": 0.5896, + "step": 3845 + }, + { + "epoch": 0.3, + "grad_norm": 1.0626539231050376, + "learning_rate": 1.6454890335941957e-05, + "loss": 0.5643, + "step": 3846 + }, + { + "epoch": 0.3, + "grad_norm": 1.2680301653736579, + "learning_rate": 1.645297103056504e-05, + "loss": 0.6094, + "step": 3847 + }, + { + "epoch": 0.3, + "grad_norm": 1.213017946892014, + "learning_rate": 1.6451051317778764e-05, + "loss": 0.6437, + "step": 3848 + }, + { + "epoch": 0.3, + "grad_norm": 1.265063502711842, + "learning_rate": 1.6449131197704337e-05, + "loss": 0.6289, + "step": 3849 + }, + { + "epoch": 0.3, + "grad_norm": 1.218225366206749, + "learning_rate": 1.6447210670462985e-05, + "loss": 0.636, + "step": 3850 + }, + { + "epoch": 0.3, + "grad_norm": 1.20334291342482, + "learning_rate": 1.644528973617596e-05, + "loss": 0.6317, + "step": 3851 + }, + { + "epoch": 0.3, + "grad_norm": 1.1452117042892074, + "learning_rate": 1.644336839496454e-05, + "loss": 0.5835, + "step": 3852 + }, + { + "epoch": 0.3, + "grad_norm": 1.3163676270385396, + "learning_rate": 1.6441446646950027e-05, + "loss": 0.6678, + "step": 3853 + }, + { + "epoch": 0.3, + "grad_norm": 1.1686447891777894, + "learning_rate": 1.643952449225376e-05, + "loss": 0.5849, + "step": 3854 + }, + { + "epoch": 0.3, + "grad_norm": 1.1595691261330412, + "learning_rate": 1.6437601930997083e-05, + "loss": 0.5801, + "step": 3855 + }, + { + "epoch": 0.3, + "grad_norm": 1.2232351181993006, + "learning_rate": 1.643567896330138e-05, + "loss": 0.5258, + "step": 3856 + }, + { + "epoch": 0.3, + "grad_norm": 1.235197300382614, + "learning_rate": 1.6433755589288067e-05, + "loss": 0.6101, + "step": 3857 + }, + { + "epoch": 0.3, + "grad_norm": 1.1154672142453195, + "learning_rate": 1.643183180907857e-05, + "loss": 0.5405, + "step": 3858 + }, + { + "epoch": 0.3, + "grad_norm": 1.10278393279012, + "learning_rate": 1.642990762279434e-05, + "loss": 0.5437, + "step": 3859 + }, + { + "epoch": 0.3, + "grad_norm": 1.223961958643096, + "learning_rate": 1.6427983030556872e-05, + "loss": 0.6144, + "step": 3860 + }, + { + "epoch": 0.3, + "grad_norm": 1.3024641268543844, + "learning_rate": 1.6426058032487675e-05, + "loss": 0.6771, + "step": 3861 + }, + { + "epoch": 0.3, + "grad_norm": 1.1650502667204066, + "learning_rate": 1.6424132628708276e-05, + "loss": 0.6146, + "step": 3862 + }, + { + "epoch": 0.3, + "grad_norm": 1.1345842909251438, + "learning_rate": 1.642220681934024e-05, + "loss": 0.55, + "step": 3863 + }, + { + "epoch": 0.3, + "grad_norm": 1.1848294697157156, + "learning_rate": 1.6420280604505152e-05, + "loss": 0.543, + "step": 3864 + }, + { + "epoch": 0.3, + "grad_norm": 1.2912866074283447, + "learning_rate": 1.6418353984324628e-05, + "loss": 0.657, + "step": 3865 + }, + { + "epoch": 0.3, + "grad_norm": 1.1422981114715618, + "learning_rate": 1.6416426958920303e-05, + "loss": 0.612, + "step": 3866 + }, + { + "epoch": 0.3, + "grad_norm": 1.1744473009657208, + "learning_rate": 1.6414499528413838e-05, + "loss": 0.5618, + "step": 3867 + }, + { + "epoch": 0.3, + "grad_norm": 1.1696305206713205, + "learning_rate": 1.6412571692926924e-05, + "loss": 0.5347, + "step": 3868 + }, + { + "epoch": 0.3, + "grad_norm": 1.2394894262480667, + "learning_rate": 1.6410643452581274e-05, + "loss": 0.5943, + "step": 3869 + }, + { + "epoch": 0.3, + "grad_norm": 1.1992516528657549, + "learning_rate": 1.640871480749863e-05, + "loss": 0.6021, + "step": 3870 + }, + { + "epoch": 0.3, + "grad_norm": 1.2168749543882105, + "learning_rate": 1.6406785757800753e-05, + "loss": 0.6033, + "step": 3871 + }, + { + "epoch": 0.3, + "grad_norm": 1.2110106907537241, + "learning_rate": 1.6404856303609435e-05, + "loss": 0.6042, + "step": 3872 + }, + { + "epoch": 0.3, + "grad_norm": 1.0960628034773212, + "learning_rate": 1.64029264450465e-05, + "loss": 0.5802, + "step": 3873 + }, + { + "epoch": 0.3, + "grad_norm": 1.1151483238679345, + "learning_rate": 1.640099618223378e-05, + "loss": 0.601, + "step": 3874 + }, + { + "epoch": 0.3, + "grad_norm": 1.2044190222668023, + "learning_rate": 1.6399065515293145e-05, + "loss": 0.6263, + "step": 3875 + }, + { + "epoch": 0.3, + "grad_norm": 1.1307449414690194, + "learning_rate": 1.6397134444346487e-05, + "loss": 0.5414, + "step": 3876 + }, + { + "epoch": 0.3, + "grad_norm": 1.253980640331903, + "learning_rate": 1.6395202969515735e-05, + "loss": 0.6337, + "step": 3877 + }, + { + "epoch": 0.3, + "grad_norm": 1.2784498832436724, + "learning_rate": 1.639327109092282e-05, + "loss": 0.614, + "step": 3878 + }, + { + "epoch": 0.3, + "grad_norm": 1.3343225723437493, + "learning_rate": 1.639133880868971e-05, + "loss": 0.6371, + "step": 3879 + }, + { + "epoch": 0.3, + "grad_norm": 1.1959331123026353, + "learning_rate": 1.6389406122938415e-05, + "loss": 0.5897, + "step": 3880 + }, + { + "epoch": 0.3, + "grad_norm": 1.174967385914844, + "learning_rate": 1.6387473033790944e-05, + "loss": 0.5861, + "step": 3881 + }, + { + "epoch": 0.3, + "grad_norm": 1.1046235320955742, + "learning_rate": 1.6385539541369346e-05, + "loss": 0.551, + "step": 3882 + }, + { + "epoch": 0.3, + "grad_norm": 1.2528323033842708, + "learning_rate": 1.6383605645795687e-05, + "loss": 0.6035, + "step": 3883 + }, + { + "epoch": 0.3, + "grad_norm": 1.2644852580836072, + "learning_rate": 1.6381671347192073e-05, + "loss": 0.5991, + "step": 3884 + }, + { + "epoch": 0.3, + "grad_norm": 1.2052585955101447, + "learning_rate": 1.6379736645680622e-05, + "loss": 0.561, + "step": 3885 + }, + { + "epoch": 0.3, + "grad_norm": 1.3026870650318307, + "learning_rate": 1.6377801541383477e-05, + "loss": 0.6556, + "step": 3886 + }, + { + "epoch": 0.3, + "grad_norm": 1.350319042300315, + "learning_rate": 1.637586603442282e-05, + "loss": 0.6403, + "step": 3887 + }, + { + "epoch": 0.3, + "grad_norm": 1.1535409688829779, + "learning_rate": 1.6373930124920844e-05, + "loss": 0.5629, + "step": 3888 + }, + { + "epoch": 0.3, + "grad_norm": 1.300501337950479, + "learning_rate": 1.6371993812999773e-05, + "loss": 0.5918, + "step": 3889 + }, + { + "epoch": 0.3, + "grad_norm": 1.4052594829022873, + "learning_rate": 1.637005709878186e-05, + "loss": 0.6102, + "step": 3890 + }, + { + "epoch": 0.3, + "grad_norm": 1.2750333370263558, + "learning_rate": 1.6368119982389373e-05, + "loss": 0.6711, + "step": 3891 + }, + { + "epoch": 0.3, + "grad_norm": 1.2093439260822887, + "learning_rate": 1.6366182463944618e-05, + "loss": 0.6573, + "step": 3892 + }, + { + "epoch": 0.3, + "grad_norm": 1.2163617625152745, + "learning_rate": 1.636424454356992e-05, + "loss": 0.5821, + "step": 3893 + }, + { + "epoch": 0.3, + "grad_norm": 1.1146755120969853, + "learning_rate": 1.636230622138763e-05, + "loss": 0.6064, + "step": 3894 + }, + { + "epoch": 0.3, + "grad_norm": 1.179718977148219, + "learning_rate": 1.636036749752012e-05, + "loss": 0.5514, + "step": 3895 + }, + { + "epoch": 0.3, + "grad_norm": 1.3208943726949323, + "learning_rate": 1.63584283720898e-05, + "loss": 0.6838, + "step": 3896 + }, + { + "epoch": 0.3, + "grad_norm": 1.3103264567866675, + "learning_rate": 1.6356488845219086e-05, + "loss": 0.5689, + "step": 3897 + }, + { + "epoch": 0.3, + "grad_norm": 1.3054659228748, + "learning_rate": 1.6354548917030437e-05, + "loss": 0.5876, + "step": 3898 + }, + { + "epoch": 0.3, + "grad_norm": 1.1243838106226864, + "learning_rate": 1.635260858764633e-05, + "loss": 0.5186, + "step": 3899 + }, + { + "epoch": 0.3, + "grad_norm": 1.2194864542209984, + "learning_rate": 1.6350667857189268e-05, + "loss": 0.6127, + "step": 3900 + }, + { + "epoch": 0.3, + "grad_norm": 1.275604744151813, + "learning_rate": 1.634872672578178e-05, + "loss": 0.6614, + "step": 3901 + }, + { + "epoch": 0.3, + "grad_norm": 1.179660064176127, + "learning_rate": 1.6346785193546418e-05, + "loss": 0.5828, + "step": 3902 + }, + { + "epoch": 0.3, + "grad_norm": 1.1594623584172743, + "learning_rate": 1.634484326060576e-05, + "loss": 0.5918, + "step": 3903 + }, + { + "epoch": 0.3, + "grad_norm": 1.2652293399193577, + "learning_rate": 1.6342900927082415e-05, + "loss": 0.5914, + "step": 3904 + }, + { + "epoch": 0.3, + "grad_norm": 1.2085145891103632, + "learning_rate": 1.634095819309901e-05, + "loss": 0.5946, + "step": 3905 + }, + { + "epoch": 0.3, + "grad_norm": 1.2495752566641438, + "learning_rate": 1.6339015058778196e-05, + "loss": 0.5987, + "step": 3906 + }, + { + "epoch": 0.3, + "grad_norm": 1.3488035321859129, + "learning_rate": 1.6337071524242657e-05, + "loss": 0.6658, + "step": 3907 + }, + { + "epoch": 0.3, + "grad_norm": 1.0699372016446838, + "learning_rate": 1.6335127589615097e-05, + "loss": 0.5674, + "step": 3908 + }, + { + "epoch": 0.3, + "grad_norm": 1.1791559499696191, + "learning_rate": 1.6333183255018247e-05, + "loss": 0.5647, + "step": 3909 + }, + { + "epoch": 0.3, + "grad_norm": 1.2328335275536846, + "learning_rate": 1.6331238520574866e-05, + "loss": 0.6384, + "step": 3910 + }, + { + "epoch": 0.3, + "grad_norm": 1.1918939311499923, + "learning_rate": 1.632929338640773e-05, + "loss": 0.6079, + "step": 3911 + }, + { + "epoch": 0.3, + "grad_norm": 1.3075585213250527, + "learning_rate": 1.632734785263965e-05, + "loss": 0.6057, + "step": 3912 + }, + { + "epoch": 0.3, + "grad_norm": 1.167633161527098, + "learning_rate": 1.6325401919393455e-05, + "loss": 0.5506, + "step": 3913 + }, + { + "epoch": 0.3, + "grad_norm": 1.248550910238651, + "learning_rate": 1.6323455586792e-05, + "loss": 0.6152, + "step": 3914 + }, + { + "epoch": 0.3, + "grad_norm": 1.0655156143741384, + "learning_rate": 1.632150885495817e-05, + "loss": 0.5355, + "step": 3915 + }, + { + "epoch": 0.3, + "grad_norm": 1.1881358803256417, + "learning_rate": 1.631956172401487e-05, + "loss": 0.6435, + "step": 3916 + }, + { + "epoch": 0.3, + "grad_norm": 1.2162741430695418, + "learning_rate": 1.6317614194085033e-05, + "loss": 0.5917, + "step": 3917 + }, + { + "epoch": 0.3, + "grad_norm": 1.5600653945225398, + "learning_rate": 1.631566626529162e-05, + "loss": 0.5381, + "step": 3918 + }, + { + "epoch": 0.3, + "grad_norm": 1.076173093487003, + "learning_rate": 1.6313717937757612e-05, + "loss": 0.5388, + "step": 3919 + }, + { + "epoch": 0.3, + "grad_norm": 1.1156762845943378, + "learning_rate": 1.6311769211606012e-05, + "loss": 0.5769, + "step": 3920 + }, + { + "epoch": 0.3, + "grad_norm": 1.27629685737159, + "learning_rate": 1.630982008695986e-05, + "loss": 0.6195, + "step": 3921 + }, + { + "epoch": 0.3, + "grad_norm": 1.2025496853657713, + "learning_rate": 1.6307870563942212e-05, + "loss": 0.5878, + "step": 3922 + }, + { + "epoch": 0.3, + "grad_norm": 1.2169623346635952, + "learning_rate": 1.630592064267615e-05, + "loss": 0.5994, + "step": 3923 + }, + { + "epoch": 0.3, + "grad_norm": 1.2369774533979363, + "learning_rate": 1.6303970323284784e-05, + "loss": 0.633, + "step": 3924 + }, + { + "epoch": 0.3, + "grad_norm": 1.119677669429823, + "learning_rate": 1.630201960589125e-05, + "loss": 0.5875, + "step": 3925 + }, + { + "epoch": 0.3, + "grad_norm": 1.1193954949816354, + "learning_rate": 1.63000684906187e-05, + "loss": 0.5378, + "step": 3926 + }, + { + "epoch": 0.3, + "grad_norm": 1.1732046594741175, + "learning_rate": 1.6298116977590323e-05, + "loss": 0.5885, + "step": 3927 + }, + { + "epoch": 0.3, + "grad_norm": 1.1824317994723597, + "learning_rate": 1.6296165066929327e-05, + "loss": 0.5893, + "step": 3928 + }, + { + "epoch": 0.3, + "grad_norm": 1.1695816487860884, + "learning_rate": 1.6294212758758945e-05, + "loss": 0.6354, + "step": 3929 + }, + { + "epoch": 0.3, + "grad_norm": 1.1942416611624476, + "learning_rate": 1.629226005320244e-05, + "loss": 0.6118, + "step": 3930 + }, + { + "epoch": 0.3, + "grad_norm": 1.1750110625699934, + "learning_rate": 1.6290306950383095e-05, + "loss": 0.6372, + "step": 3931 + }, + { + "epoch": 0.31, + "grad_norm": 1.2056301837056256, + "learning_rate": 1.6288353450424216e-05, + "loss": 0.5981, + "step": 3932 + }, + { + "epoch": 0.31, + "grad_norm": 1.2117346324162617, + "learning_rate": 1.628639955344914e-05, + "loss": 0.5877, + "step": 3933 + }, + { + "epoch": 0.31, + "grad_norm": 1.21289696438372, + "learning_rate": 1.628444525958123e-05, + "loss": 0.6124, + "step": 3934 + }, + { + "epoch": 0.31, + "grad_norm": 1.1563761358794684, + "learning_rate": 1.6282490568943867e-05, + "loss": 0.5792, + "step": 3935 + }, + { + "epoch": 0.31, + "grad_norm": 1.254263996179238, + "learning_rate": 1.6280535481660454e-05, + "loss": 0.642, + "step": 3936 + }, + { + "epoch": 0.31, + "grad_norm": 1.2512793669555577, + "learning_rate": 1.627857999785444e-05, + "loss": 0.5883, + "step": 3937 + }, + { + "epoch": 0.31, + "grad_norm": 1.1928629990067652, + "learning_rate": 1.6276624117649273e-05, + "loss": 0.6174, + "step": 3938 + }, + { + "epoch": 0.31, + "grad_norm": 1.2099964546120145, + "learning_rate": 1.6274667841168445e-05, + "loss": 0.572, + "step": 3939 + }, + { + "epoch": 0.31, + "grad_norm": 1.1618369929277412, + "learning_rate": 1.6272711168535465e-05, + "loss": 0.6543, + "step": 3940 + }, + { + "epoch": 0.31, + "grad_norm": 1.2139831634874838, + "learning_rate": 1.6270754099873866e-05, + "loss": 0.572, + "step": 3941 + }, + { + "epoch": 0.31, + "grad_norm": 1.2067460892991102, + "learning_rate": 1.62687966353072e-05, + "loss": 0.6673, + "step": 3942 + }, + { + "epoch": 0.31, + "grad_norm": 1.1363861541250981, + "learning_rate": 1.6266838774959065e-05, + "loss": 0.5476, + "step": 3943 + }, + { + "epoch": 0.31, + "grad_norm": 1.159579149559187, + "learning_rate": 1.6264880518953068e-05, + "loss": 0.5904, + "step": 3944 + }, + { + "epoch": 0.31, + "grad_norm": 1.0840394702003568, + "learning_rate": 1.626292186741284e-05, + "loss": 0.5894, + "step": 3945 + }, + { + "epoch": 0.31, + "grad_norm": 1.3137632829918364, + "learning_rate": 1.626096282046204e-05, + "loss": 0.6124, + "step": 3946 + }, + { + "epoch": 0.31, + "grad_norm": 1.199484929170182, + "learning_rate": 1.6259003378224358e-05, + "loss": 0.5361, + "step": 3947 + }, + { + "epoch": 0.31, + "grad_norm": 1.126221523454137, + "learning_rate": 1.6257043540823498e-05, + "loss": 0.5546, + "step": 3948 + }, + { + "epoch": 0.31, + "grad_norm": 1.1876042972740064, + "learning_rate": 1.6255083308383195e-05, + "loss": 0.5831, + "step": 3949 + }, + { + "epoch": 0.31, + "grad_norm": 1.0887787239849271, + "learning_rate": 1.6253122681027214e-05, + "loss": 0.6001, + "step": 3950 + }, + { + "epoch": 0.31, + "grad_norm": 1.2446340304149863, + "learning_rate": 1.6251161658879338e-05, + "loss": 0.6371, + "step": 3951 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377869976433886, + "learning_rate": 1.6249200242063368e-05, + "loss": 0.5563, + "step": 3952 + }, + { + "epoch": 0.31, + "grad_norm": 1.1588291065492486, + "learning_rate": 1.624723843070315e-05, + "loss": 0.638, + "step": 3953 + }, + { + "epoch": 0.31, + "grad_norm": 1.111704462620387, + "learning_rate": 1.6245276224922538e-05, + "loss": 0.602, + "step": 3954 + }, + { + "epoch": 0.31, + "grad_norm": 1.234620564818056, + "learning_rate": 1.6243313624845417e-05, + "loss": 0.5974, + "step": 3955 + }, + { + "epoch": 0.31, + "grad_norm": 1.2223304582604837, + "learning_rate": 1.6241350630595697e-05, + "loss": 0.5917, + "step": 3956 + }, + { + "epoch": 0.31, + "grad_norm": 1.1517162101364646, + "learning_rate": 1.623938724229731e-05, + "loss": 0.5855, + "step": 3957 + }, + { + "epoch": 0.31, + "grad_norm": 1.271761909349612, + "learning_rate": 1.623742346007421e-05, + "loss": 0.6164, + "step": 3958 + }, + { + "epoch": 0.31, + "grad_norm": 1.2746847866335282, + "learning_rate": 1.6235459284050393e-05, + "loss": 0.6274, + "step": 3959 + }, + { + "epoch": 0.31, + "grad_norm": 1.2029244887482289, + "learning_rate": 1.6233494714349854e-05, + "loss": 0.6257, + "step": 3960 + }, + { + "epoch": 0.31, + "grad_norm": 1.1819769262524185, + "learning_rate": 1.623152975109664e-05, + "loss": 0.6301, + "step": 3961 + }, + { + "epoch": 0.31, + "grad_norm": 1.2891098476153031, + "learning_rate": 1.6229564394414796e-05, + "loss": 0.6508, + "step": 3962 + }, + { + "epoch": 0.31, + "grad_norm": 1.2121481965142733, + "learning_rate": 1.6227598644428417e-05, + "loss": 0.5883, + "step": 3963 + }, + { + "epoch": 0.31, + "grad_norm": 1.2397736421989687, + "learning_rate": 1.6225632501261603e-05, + "loss": 0.6003, + "step": 3964 + }, + { + "epoch": 0.31, + "grad_norm": 1.241069074277615, + "learning_rate": 1.622366596503849e-05, + "loss": 0.5807, + "step": 3965 + }, + { + "epoch": 0.31, + "grad_norm": 1.2458192050519732, + "learning_rate": 1.622169903588323e-05, + "loss": 0.581, + "step": 3966 + }, + { + "epoch": 0.31, + "grad_norm": 1.15965748350523, + "learning_rate": 1.6219731713920017e-05, + "loss": 0.583, + "step": 3967 + }, + { + "epoch": 0.31, + "grad_norm": 1.2442792638549236, + "learning_rate": 1.621776399927305e-05, + "loss": 0.5626, + "step": 3968 + }, + { + "epoch": 0.31, + "grad_norm": 1.1158338762604407, + "learning_rate": 1.6215795892066556e-05, + "loss": 0.5716, + "step": 3969 + }, + { + "epoch": 0.31, + "grad_norm": 1.216933486152108, + "learning_rate": 1.6213827392424802e-05, + "loss": 0.5675, + "step": 3970 + }, + { + "epoch": 0.31, + "grad_norm": 1.3129663547147643, + "learning_rate": 1.6211858500472068e-05, + "loss": 0.6168, + "step": 3971 + }, + { + "epoch": 0.31, + "grad_norm": 1.5181623632828118, + "learning_rate": 1.6209889216332657e-05, + "loss": 0.5845, + "step": 3972 + }, + { + "epoch": 0.31, + "grad_norm": 1.2594047561802864, + "learning_rate": 1.62079195401309e-05, + "loss": 0.6096, + "step": 3973 + }, + { + "epoch": 0.31, + "grad_norm": 1.1115418883893513, + "learning_rate": 1.6205949471991154e-05, + "loss": 0.6022, + "step": 3974 + }, + { + "epoch": 0.31, + "grad_norm": 1.2668267655000156, + "learning_rate": 1.6203979012037797e-05, + "loss": 0.6232, + "step": 3975 + }, + { + "epoch": 0.31, + "grad_norm": 1.1391563366410193, + "learning_rate": 1.620200816039524e-05, + "loss": 0.5601, + "step": 3976 + }, + { + "epoch": 0.31, + "grad_norm": 1.2115418925487111, + "learning_rate": 1.6200036917187908e-05, + "loss": 0.6136, + "step": 3977 + }, + { + "epoch": 0.31, + "grad_norm": 1.2694049948487522, + "learning_rate": 1.6198065282540258e-05, + "loss": 0.6167, + "step": 3978 + }, + { + "epoch": 0.31, + "grad_norm": 1.2722178510223396, + "learning_rate": 1.619609325657677e-05, + "loss": 0.6318, + "step": 3979 + }, + { + "epoch": 0.31, + "grad_norm": 1.3864098863092906, + "learning_rate": 1.619412083942195e-05, + "loss": 0.6064, + "step": 3980 + }, + { + "epoch": 0.31, + "grad_norm": 1.1347166172284997, + "learning_rate": 1.6192148031200315e-05, + "loss": 0.5682, + "step": 3981 + }, + { + "epoch": 0.31, + "grad_norm": 1.1315261377340633, + "learning_rate": 1.6190174832036434e-05, + "loss": 0.5535, + "step": 3982 + }, + { + "epoch": 0.31, + "grad_norm": 1.119340436051421, + "learning_rate": 1.6188201242054875e-05, + "loss": 0.5955, + "step": 3983 + }, + { + "epoch": 0.31, + "grad_norm": 1.2052149764493065, + "learning_rate": 1.6186227261380247e-05, + "loss": 0.6179, + "step": 3984 + }, + { + "epoch": 0.31, + "grad_norm": 1.170635228802279, + "learning_rate": 1.6184252890137175e-05, + "loss": 0.6167, + "step": 3985 + }, + { + "epoch": 0.31, + "grad_norm": 1.1688227768541013, + "learning_rate": 1.618227812845031e-05, + "loss": 0.5885, + "step": 3986 + }, + { + "epoch": 0.31, + "grad_norm": 1.2806564444855968, + "learning_rate": 1.6180302976444332e-05, + "loss": 0.6695, + "step": 3987 + }, + { + "epoch": 0.31, + "grad_norm": 1.1579952471574613, + "learning_rate": 1.617832743424394e-05, + "loss": 0.5816, + "step": 3988 + }, + { + "epoch": 0.31, + "grad_norm": 1.1821707553065879, + "learning_rate": 1.6176351501973865e-05, + "loss": 0.6079, + "step": 3989 + }, + { + "epoch": 0.31, + "grad_norm": 1.191993693626286, + "learning_rate": 1.6174375179758854e-05, + "loss": 0.6214, + "step": 3990 + }, + { + "epoch": 0.31, + "grad_norm": 1.2450894221781705, + "learning_rate": 1.6172398467723674e-05, + "loss": 0.5836, + "step": 3991 + }, + { + "epoch": 0.31, + "grad_norm": 1.2226072551429032, + "learning_rate": 1.617042136599314e-05, + "loss": 0.6354, + "step": 3992 + }, + { + "epoch": 0.31, + "grad_norm": 1.1979623979325693, + "learning_rate": 1.6168443874692066e-05, + "loss": 0.5994, + "step": 3993 + }, + { + "epoch": 0.31, + "grad_norm": 1.4221004265466641, + "learning_rate": 1.616646599394531e-05, + "loss": 0.5728, + "step": 3994 + }, + { + "epoch": 0.31, + "grad_norm": 1.2873378892337566, + "learning_rate": 1.616448772387774e-05, + "loss": 0.6059, + "step": 3995 + }, + { + "epoch": 0.31, + "grad_norm": 1.061620235855083, + "learning_rate": 1.6162509064614253e-05, + "loss": 0.5424, + "step": 3996 + }, + { + "epoch": 0.31, + "grad_norm": 1.1789231508331548, + "learning_rate": 1.6160530016279774e-05, + "loss": 0.4952, + "step": 3997 + }, + { + "epoch": 0.31, + "grad_norm": 1.1910281222279882, + "learning_rate": 1.6158550578999252e-05, + "loss": 0.6121, + "step": 3998 + }, + { + "epoch": 0.31, + "grad_norm": 1.125059867961342, + "learning_rate": 1.6156570752897658e-05, + "loss": 0.5408, + "step": 3999 + }, + { + "epoch": 0.31, + "grad_norm": 1.2594834595730304, + "learning_rate": 1.6154590538099993e-05, + "loss": 0.5299, + "step": 4000 + }, + { + "epoch": 0.31, + "grad_norm": 1.153355816739578, + "learning_rate": 1.6152609934731266e-05, + "loss": 0.5875, + "step": 4001 + }, + { + "epoch": 0.31, + "grad_norm": 1.139540431012989, + "learning_rate": 1.615062894291654e-05, + "loss": 0.5971, + "step": 4002 + }, + { + "epoch": 0.31, + "grad_norm": 1.1937285755891243, + "learning_rate": 1.6148647562780865e-05, + "loss": 0.5594, + "step": 4003 + }, + { + "epoch": 0.31, + "grad_norm": 1.1929602821913936, + "learning_rate": 1.6146665794449352e-05, + "loss": 0.626, + "step": 4004 + }, + { + "epoch": 0.31, + "grad_norm": 1.2669126765290302, + "learning_rate": 1.6144683638047115e-05, + "loss": 0.6363, + "step": 4005 + }, + { + "epoch": 0.31, + "grad_norm": 1.1527505964626341, + "learning_rate": 1.6142701093699296e-05, + "loss": 0.5758, + "step": 4006 + }, + { + "epoch": 0.31, + "grad_norm": 1.2021356949181223, + "learning_rate": 1.6140718161531066e-05, + "loss": 0.5874, + "step": 4007 + }, + { + "epoch": 0.31, + "grad_norm": 1.2117869689351568, + "learning_rate": 1.613873484166762e-05, + "loss": 0.6391, + "step": 4008 + }, + { + "epoch": 0.31, + "grad_norm": 1.1044450744780192, + "learning_rate": 1.6136751134234163e-05, + "loss": 0.5521, + "step": 4009 + }, + { + "epoch": 0.31, + "grad_norm": 1.2154085863952253, + "learning_rate": 1.6134767039355953e-05, + "loss": 0.6514, + "step": 4010 + }, + { + "epoch": 0.31, + "grad_norm": 1.178474914636623, + "learning_rate": 1.6132782557158245e-05, + "loss": 0.5812, + "step": 4011 + }, + { + "epoch": 0.31, + "grad_norm": 1.1189564391888203, + "learning_rate": 1.6130797687766335e-05, + "loss": 0.544, + "step": 4012 + }, + { + "epoch": 0.31, + "grad_norm": 1.1246021944951656, + "learning_rate": 1.6128812431305534e-05, + "loss": 0.5812, + "step": 4013 + }, + { + "epoch": 0.31, + "grad_norm": 1.175541647390319, + "learning_rate": 1.6126826787901185e-05, + "loss": 0.6081, + "step": 4014 + }, + { + "epoch": 0.31, + "grad_norm": 1.1366678337937643, + "learning_rate": 1.6124840757678653e-05, + "loss": 0.6106, + "step": 4015 + }, + { + "epoch": 0.31, + "grad_norm": 1.1244202815591704, + "learning_rate": 1.612285434076332e-05, + "loss": 0.5615, + "step": 4016 + }, + { + "epoch": 0.31, + "grad_norm": 1.2184845072779211, + "learning_rate": 1.6120867537280608e-05, + "loss": 0.6, + "step": 4017 + }, + { + "epoch": 0.31, + "grad_norm": 1.1816462240795806, + "learning_rate": 1.611888034735594e-05, + "loss": 0.6147, + "step": 4018 + }, + { + "epoch": 0.31, + "grad_norm": 1.2339105818886065, + "learning_rate": 1.6116892771114793e-05, + "loss": 0.633, + "step": 4019 + }, + { + "epoch": 0.31, + "grad_norm": 1.0915302094260146, + "learning_rate": 1.6114904808682644e-05, + "loss": 0.5481, + "step": 4020 + }, + { + "epoch": 0.31, + "grad_norm": 1.1885559506885552, + "learning_rate": 1.6112916460185008e-05, + "loss": 0.6277, + "step": 4021 + }, + { + "epoch": 0.31, + "grad_norm": 1.2741424855810366, + "learning_rate": 1.6110927725747413e-05, + "loss": 0.6197, + "step": 4022 + }, + { + "epoch": 0.31, + "grad_norm": 1.1242694602064187, + "learning_rate": 1.610893860549543e-05, + "loss": 0.5296, + "step": 4023 + }, + { + "epoch": 0.31, + "grad_norm": 1.1756699214710347, + "learning_rate": 1.610694909955463e-05, + "loss": 0.5201, + "step": 4024 + }, + { + "epoch": 0.31, + "grad_norm": 1.2186917022064503, + "learning_rate": 1.6104959208050625e-05, + "loss": 0.6423, + "step": 4025 + }, + { + "epoch": 0.31, + "grad_norm": 1.1961654412370195, + "learning_rate": 1.610296893110905e-05, + "loss": 0.586, + "step": 4026 + }, + { + "epoch": 0.31, + "grad_norm": 1.1615095369601849, + "learning_rate": 1.6100978268855552e-05, + "loss": 0.5639, + "step": 4027 + }, + { + "epoch": 0.31, + "grad_norm": 1.2014112280579905, + "learning_rate": 1.609898722141583e-05, + "loss": 0.6249, + "step": 4028 + }, + { + "epoch": 0.31, + "grad_norm": 1.1315937194478678, + "learning_rate": 1.609699578891557e-05, + "loss": 0.5543, + "step": 4029 + }, + { + "epoch": 0.31, + "grad_norm": 1.3114084064089124, + "learning_rate": 1.6095003971480514e-05, + "loss": 0.6376, + "step": 4030 + }, + { + "epoch": 0.31, + "grad_norm": 1.265077448950562, + "learning_rate": 1.609301176923641e-05, + "loss": 0.6216, + "step": 4031 + }, + { + "epoch": 0.31, + "grad_norm": 1.1475924022928314, + "learning_rate": 1.6091019182309033e-05, + "loss": 0.5656, + "step": 4032 + }, + { + "epoch": 0.31, + "grad_norm": 1.1737722170903604, + "learning_rate": 1.6089026210824196e-05, + "loss": 0.6254, + "step": 4033 + }, + { + "epoch": 0.31, + "grad_norm": 1.1438404381021985, + "learning_rate": 1.6087032854907715e-05, + "loss": 0.5805, + "step": 4034 + }, + { + "epoch": 0.31, + "grad_norm": 1.0830969674895723, + "learning_rate": 1.6085039114685448e-05, + "loss": 0.5331, + "step": 4035 + }, + { + "epoch": 0.31, + "grad_norm": 1.1128636698004142, + "learning_rate": 1.6083044990283263e-05, + "loss": 0.5696, + "step": 4036 + }, + { + "epoch": 0.31, + "grad_norm": 1.194368126498595, + "learning_rate": 1.6081050481827066e-05, + "loss": 0.6, + "step": 4037 + }, + { + "epoch": 0.31, + "grad_norm": 1.1239262861197168, + "learning_rate": 1.6079055589442778e-05, + "loss": 0.6219, + "step": 4038 + }, + { + "epoch": 0.31, + "grad_norm": 1.2518657588379594, + "learning_rate": 1.6077060313256348e-05, + "loss": 0.6307, + "step": 4039 + }, + { + "epoch": 0.31, + "grad_norm": 1.090109953277923, + "learning_rate": 1.6075064653393748e-05, + "loss": 0.5383, + "step": 4040 + }, + { + "epoch": 0.31, + "grad_norm": 1.2473465412676827, + "learning_rate": 1.6073068609980968e-05, + "loss": 0.5926, + "step": 4041 + }, + { + "epoch": 0.31, + "grad_norm": 1.3225680615338957, + "learning_rate": 1.6071072183144033e-05, + "loss": 0.5962, + "step": 4042 + }, + { + "epoch": 0.31, + "grad_norm": 1.2329081257718428, + "learning_rate": 1.6069075373008997e-05, + "loss": 0.597, + "step": 4043 + }, + { + "epoch": 0.31, + "grad_norm": 1.1676300986786838, + "learning_rate": 1.6067078179701913e-05, + "loss": 0.5808, + "step": 4044 + }, + { + "epoch": 0.31, + "grad_norm": 1.1526814628616309, + "learning_rate": 1.6065080603348885e-05, + "loss": 0.6044, + "step": 4045 + }, + { + "epoch": 0.31, + "grad_norm": 1.2083597125539236, + "learning_rate": 1.6063082644076026e-05, + "loss": 0.6071, + "step": 4046 + }, + { + "epoch": 0.31, + "grad_norm": 1.2155312313208602, + "learning_rate": 1.6061084302009473e-05, + "loss": 0.6456, + "step": 4047 + }, + { + "epoch": 0.31, + "grad_norm": 1.1809232189145735, + "learning_rate": 1.6059085577275402e-05, + "loss": 0.6001, + "step": 4048 + }, + { + "epoch": 0.31, + "grad_norm": 1.3228249655752855, + "learning_rate": 1.605708647e-05, + "loss": 0.5856, + "step": 4049 + }, + { + "epoch": 0.31, + "grad_norm": 1.174252146273254, + "learning_rate": 1.6055086980309475e-05, + "loss": 0.6324, + "step": 4050 + }, + { + "epoch": 0.31, + "grad_norm": 1.333198326944949, + "learning_rate": 1.605308710833007e-05, + "loss": 0.5949, + "step": 4051 + }, + { + "epoch": 0.31, + "grad_norm": 1.1584392647823685, + "learning_rate": 1.6051086854188046e-05, + "loss": 0.5239, + "step": 4052 + }, + { + "epoch": 0.31, + "grad_norm": 1.042404193455187, + "learning_rate": 1.604908621800969e-05, + "loss": 0.5189, + "step": 4053 + }, + { + "epoch": 0.31, + "grad_norm": 1.1160018604939856, + "learning_rate": 1.604708519992131e-05, + "loss": 0.5914, + "step": 4054 + }, + { + "epoch": 0.31, + "grad_norm": 1.246749179390805, + "learning_rate": 1.6045083800049243e-05, + "loss": 0.5528, + "step": 4055 + }, + { + "epoch": 0.31, + "grad_norm": 1.3248094349702548, + "learning_rate": 1.604308201851985e-05, + "loss": 0.638, + "step": 4056 + }, + { + "epoch": 0.31, + "grad_norm": 1.1998209978746488, + "learning_rate": 1.6041079855459506e-05, + "loss": 0.6108, + "step": 4057 + }, + { + "epoch": 0.31, + "grad_norm": 1.2605346225158425, + "learning_rate": 1.603907731099463e-05, + "loss": 0.5983, + "step": 4058 + }, + { + "epoch": 0.31, + "grad_norm": 1.2750747079092335, + "learning_rate": 1.603707438525164e-05, + "loss": 0.6677, + "step": 4059 + }, + { + "epoch": 0.31, + "grad_norm": 1.2016606522690463, + "learning_rate": 1.6035071078356998e-05, + "loss": 0.5742, + "step": 4060 + }, + { + "epoch": 0.32, + "grad_norm": 1.1839374509025111, + "learning_rate": 1.603306739043718e-05, + "loss": 0.5602, + "step": 4061 + }, + { + "epoch": 0.32, + "grad_norm": 1.1035169213211413, + "learning_rate": 1.6031063321618697e-05, + "loss": 0.5843, + "step": 4062 + }, + { + "epoch": 0.32, + "grad_norm": 1.200744999652961, + "learning_rate": 1.6029058872028063e-05, + "loss": 0.6201, + "step": 4063 + }, + { + "epoch": 0.32, + "grad_norm": 1.152203101933758, + "learning_rate": 1.6027054041791842e-05, + "loss": 0.6164, + "step": 4064 + }, + { + "epoch": 0.32, + "grad_norm": 1.256626826083971, + "learning_rate": 1.6025048831036603e-05, + "loss": 0.638, + "step": 4065 + }, + { + "epoch": 0.32, + "grad_norm": 1.1813671457644892, + "learning_rate": 1.6023043239888946e-05, + "loss": 0.6082, + "step": 4066 + }, + { + "epoch": 0.32, + "grad_norm": 1.2103966274123723, + "learning_rate": 1.602103726847549e-05, + "loss": 0.5989, + "step": 4067 + }, + { + "epoch": 0.32, + "grad_norm": 1.2175937939181174, + "learning_rate": 1.6019030916922892e-05, + "loss": 0.6199, + "step": 4068 + }, + { + "epoch": 0.32, + "grad_norm": 1.1165489025553181, + "learning_rate": 1.6017024185357817e-05, + "loss": 0.5692, + "step": 4069 + }, + { + "epoch": 0.32, + "grad_norm": 1.250109190940164, + "learning_rate": 1.601501707390696e-05, + "loss": 0.5884, + "step": 4070 + }, + { + "epoch": 0.32, + "grad_norm": 1.3070057813462548, + "learning_rate": 1.6013009582697043e-05, + "loss": 0.6139, + "step": 4071 + }, + { + "epoch": 0.32, + "grad_norm": 1.2741008037379273, + "learning_rate": 1.601100171185481e-05, + "loss": 0.6513, + "step": 4072 + }, + { + "epoch": 0.32, + "grad_norm": 1.0050610383320062, + "learning_rate": 1.6008993461507022e-05, + "loss": 0.4993, + "step": 4073 + }, + { + "epoch": 0.32, + "grad_norm": 1.1282934089558068, + "learning_rate": 1.6006984831780476e-05, + "loss": 0.5376, + "step": 4074 + }, + { + "epoch": 0.32, + "grad_norm": 1.2381305783419083, + "learning_rate": 1.6004975822801986e-05, + "loss": 0.6253, + "step": 4075 + }, + { + "epoch": 0.32, + "grad_norm": 1.2529711697877712, + "learning_rate": 1.600296643469839e-05, + "loss": 0.6211, + "step": 4076 + }, + { + "epoch": 0.32, + "grad_norm": 1.146491861400174, + "learning_rate": 1.6000956667596554e-05, + "loss": 0.5507, + "step": 4077 + }, + { + "epoch": 0.32, + "grad_norm": 1.0868085482674297, + "learning_rate": 1.599894652162336e-05, + "loss": 0.557, + "step": 4078 + }, + { + "epoch": 0.32, + "grad_norm": 1.2270822577911655, + "learning_rate": 1.5996935996905722e-05, + "loss": 0.5649, + "step": 4079 + }, + { + "epoch": 0.32, + "grad_norm": 1.1749864597757853, + "learning_rate": 1.5994925093570578e-05, + "loss": 0.6118, + "step": 4080 + }, + { + "epoch": 0.32, + "grad_norm": 1.2632427164876006, + "learning_rate": 1.599291381174488e-05, + "loss": 0.5955, + "step": 4081 + }, + { + "epoch": 0.32, + "grad_norm": 1.183447246486196, + "learning_rate": 1.5990902151555612e-05, + "loss": 0.5786, + "step": 4082 + }, + { + "epoch": 0.32, + "grad_norm": 1.124024498058253, + "learning_rate": 1.5988890113129786e-05, + "loss": 0.5757, + "step": 4083 + }, + { + "epoch": 0.32, + "grad_norm": 1.1925538586303988, + "learning_rate": 1.5986877696594425e-05, + "loss": 0.6069, + "step": 4084 + }, + { + "epoch": 0.32, + "grad_norm": 1.2013483181511755, + "learning_rate": 1.598486490207659e-05, + "loss": 0.6026, + "step": 4085 + }, + { + "epoch": 0.32, + "grad_norm": 1.0765093431789332, + "learning_rate": 1.598285172970335e-05, + "loss": 0.5528, + "step": 4086 + }, + { + "epoch": 0.32, + "grad_norm": 1.2788229027901965, + "learning_rate": 1.598083817960182e-05, + "loss": 0.6258, + "step": 4087 + }, + { + "epoch": 0.32, + "grad_norm": 1.3049319831983297, + "learning_rate": 1.5978824251899117e-05, + "loss": 0.6535, + "step": 4088 + }, + { + "epoch": 0.32, + "grad_norm": 1.2464490043286351, + "learning_rate": 1.597680994672239e-05, + "loss": 0.6026, + "step": 4089 + }, + { + "epoch": 0.32, + "grad_norm": 1.1429909506445404, + "learning_rate": 1.5974795264198814e-05, + "loss": 0.6242, + "step": 4090 + }, + { + "epoch": 0.32, + "grad_norm": 1.2879980605359609, + "learning_rate": 1.5972780204455592e-05, + "loss": 0.6501, + "step": 4091 + }, + { + "epoch": 0.32, + "grad_norm": 1.1755463121475838, + "learning_rate": 1.5970764767619933e-05, + "loss": 0.5684, + "step": 4092 + }, + { + "epoch": 0.32, + "grad_norm": 1.196094880206311, + "learning_rate": 1.5968748953819095e-05, + "loss": 0.6099, + "step": 4093 + }, + { + "epoch": 0.32, + "grad_norm": 1.2817598584493335, + "learning_rate": 1.596673276318034e-05, + "loss": 0.5943, + "step": 4094 + }, + { + "epoch": 0.32, + "grad_norm": 1.2085586809077369, + "learning_rate": 1.5964716195830958e-05, + "loss": 0.6354, + "step": 4095 + }, + { + "epoch": 0.32, + "grad_norm": 1.35159720668442, + "learning_rate": 1.5962699251898274e-05, + "loss": 0.6055, + "step": 4096 + }, + { + "epoch": 0.32, + "grad_norm": 1.133824705209295, + "learning_rate": 1.596068193150962e-05, + "loss": 0.5596, + "step": 4097 + }, + { + "epoch": 0.32, + "grad_norm": 1.1371854368119316, + "learning_rate": 1.595866423479236e-05, + "loss": 0.5844, + "step": 4098 + }, + { + "epoch": 0.32, + "grad_norm": 1.1801612357528344, + "learning_rate": 1.595664616187389e-05, + "loss": 0.5828, + "step": 4099 + }, + { + "epoch": 0.32, + "grad_norm": 1.2468891056955147, + "learning_rate": 1.595462771288161e-05, + "loss": 0.6108, + "step": 4100 + }, + { + "epoch": 0.32, + "grad_norm": 1.2033901170205261, + "learning_rate": 1.5952608887942967e-05, + "loss": 0.6022, + "step": 4101 + }, + { + "epoch": 0.32, + "grad_norm": 1.2634317679962785, + "learning_rate": 1.5950589687185405e-05, + "loss": 0.6315, + "step": 4102 + }, + { + "epoch": 0.32, + "grad_norm": 1.168779021938064, + "learning_rate": 1.594857011073642e-05, + "loss": 0.56, + "step": 4103 + }, + { + "epoch": 0.32, + "grad_norm": 1.0974620497969982, + "learning_rate": 1.5946550158723516e-05, + "loss": 0.5621, + "step": 4104 + }, + { + "epoch": 0.32, + "grad_norm": 1.0945140349881082, + "learning_rate": 1.5944529831274213e-05, + "loss": 0.5609, + "step": 4105 + }, + { + "epoch": 0.32, + "grad_norm": 1.1688990635594247, + "learning_rate": 1.5942509128516077e-05, + "loss": 0.5748, + "step": 4106 + }, + { + "epoch": 0.32, + "grad_norm": 1.2062149003214337, + "learning_rate": 1.594048805057668e-05, + "loss": 0.6537, + "step": 4107 + }, + { + "epoch": 0.32, + "grad_norm": 1.290696605330871, + "learning_rate": 1.5938466597583625e-05, + "loss": 0.6344, + "step": 4108 + }, + { + "epoch": 0.32, + "grad_norm": 1.2284935492880682, + "learning_rate": 1.5936444769664533e-05, + "loss": 0.5891, + "step": 4109 + }, + { + "epoch": 0.32, + "grad_norm": 1.2105269197466848, + "learning_rate": 1.593442256694705e-05, + "loss": 0.631, + "step": 4110 + }, + { + "epoch": 0.32, + "grad_norm": 1.196250198667185, + "learning_rate": 1.593239998955886e-05, + "loss": 0.593, + "step": 4111 + }, + { + "epoch": 0.32, + "grad_norm": 1.285429487777566, + "learning_rate": 1.593037703762765e-05, + "loss": 0.6019, + "step": 4112 + }, + { + "epoch": 0.32, + "grad_norm": 1.222874533573527, + "learning_rate": 1.5928353711281138e-05, + "loss": 0.5621, + "step": 4113 + }, + { + "epoch": 0.32, + "grad_norm": 1.1762218473594288, + "learning_rate": 1.5926330010647074e-05, + "loss": 0.6045, + "step": 4114 + }, + { + "epoch": 0.32, + "grad_norm": 1.0863788174540303, + "learning_rate": 1.5924305935853218e-05, + "loss": 0.5308, + "step": 4115 + }, + { + "epoch": 0.32, + "grad_norm": 1.2315550837648002, + "learning_rate": 1.5922281487027363e-05, + "loss": 0.6585, + "step": 4116 + }, + { + "epoch": 0.32, + "grad_norm": 1.156076057340976, + "learning_rate": 1.5920256664297326e-05, + "loss": 0.6323, + "step": 4117 + }, + { + "epoch": 0.32, + "grad_norm": 1.0992176720153686, + "learning_rate": 1.5918231467790938e-05, + "loss": 0.576, + "step": 4118 + }, + { + "epoch": 0.32, + "grad_norm": 1.254389399439678, + "learning_rate": 1.5916205897636063e-05, + "loss": 0.6041, + "step": 4119 + }, + { + "epoch": 0.32, + "grad_norm": 1.159064758825171, + "learning_rate": 1.5914179953960584e-05, + "loss": 0.6353, + "step": 4120 + }, + { + "epoch": 0.32, + "grad_norm": 1.172107927697421, + "learning_rate": 1.5912153636892416e-05, + "loss": 0.5914, + "step": 4121 + }, + { + "epoch": 0.32, + "grad_norm": 1.1033789085836685, + "learning_rate": 1.5910126946559484e-05, + "loss": 0.5793, + "step": 4122 + }, + { + "epoch": 0.32, + "grad_norm": 1.1070690097439773, + "learning_rate": 1.5908099883089746e-05, + "loss": 0.5886, + "step": 4123 + }, + { + "epoch": 0.32, + "grad_norm": 1.2302395077253894, + "learning_rate": 1.590607244661118e-05, + "loss": 0.5626, + "step": 4124 + }, + { + "epoch": 0.32, + "grad_norm": 1.2806528607265886, + "learning_rate": 1.5904044637251793e-05, + "loss": 0.6355, + "step": 4125 + }, + { + "epoch": 0.32, + "grad_norm": 1.1804876077333568, + "learning_rate": 1.5902016455139603e-05, + "loss": 0.6016, + "step": 4126 + }, + { + "epoch": 0.32, + "grad_norm": 1.1975104873452846, + "learning_rate": 1.589998790040266e-05, + "loss": 0.5445, + "step": 4127 + }, + { + "epoch": 0.32, + "grad_norm": 1.251928082243929, + "learning_rate": 1.589795897316905e-05, + "loss": 0.572, + "step": 4128 + }, + { + "epoch": 0.32, + "grad_norm": 1.20924101094676, + "learning_rate": 1.5895929673566858e-05, + "loss": 0.6552, + "step": 4129 + }, + { + "epoch": 0.32, + "grad_norm": 1.1965935019987255, + "learning_rate": 1.5893900001724204e-05, + "loss": 0.5841, + "step": 4130 + }, + { + "epoch": 0.32, + "grad_norm": 1.2484053453642063, + "learning_rate": 1.5891869957769232e-05, + "loss": 0.6036, + "step": 4131 + }, + { + "epoch": 0.32, + "grad_norm": 1.2915145210161616, + "learning_rate": 1.5889839541830115e-05, + "loss": 0.5962, + "step": 4132 + }, + { + "epoch": 0.32, + "grad_norm": 1.182488104455969, + "learning_rate": 1.5887808754035037e-05, + "loss": 0.6002, + "step": 4133 + }, + { + "epoch": 0.32, + "grad_norm": 1.1066614194155773, + "learning_rate": 1.588577759451222e-05, + "loss": 0.5645, + "step": 4134 + }, + { + "epoch": 0.32, + "grad_norm": 1.3685779585342646, + "learning_rate": 1.5883746063389897e-05, + "loss": 0.5852, + "step": 4135 + }, + { + "epoch": 0.32, + "grad_norm": 1.19871947988535, + "learning_rate": 1.5881714160796326e-05, + "loss": 0.6218, + "step": 4136 + }, + { + "epoch": 0.32, + "grad_norm": 1.2420773727255203, + "learning_rate": 1.5879681886859794e-05, + "loss": 0.6229, + "step": 4137 + }, + { + "epoch": 0.32, + "grad_norm": 1.2308886099309224, + "learning_rate": 1.5877649241708613e-05, + "loss": 0.5782, + "step": 4138 + }, + { + "epoch": 0.32, + "grad_norm": 1.1947508845177341, + "learning_rate": 1.5875616225471105e-05, + "loss": 0.5959, + "step": 4139 + }, + { + "epoch": 0.32, + "grad_norm": 1.1290108919319388, + "learning_rate": 1.5873582838275637e-05, + "loss": 0.5318, + "step": 4140 + }, + { + "epoch": 0.32, + "grad_norm": 1.1401085925666197, + "learning_rate": 1.5871549080250577e-05, + "loss": 0.5707, + "step": 4141 + }, + { + "epoch": 0.32, + "grad_norm": 1.1563511881497714, + "learning_rate": 1.586951495152433e-05, + "loss": 0.5712, + "step": 4142 + }, + { + "epoch": 0.32, + "grad_norm": 1.203907947180144, + "learning_rate": 1.5867480452225323e-05, + "loss": 0.6535, + "step": 4143 + }, + { + "epoch": 0.32, + "grad_norm": 1.2168094150637931, + "learning_rate": 1.5865445582482002e-05, + "loss": 0.5632, + "step": 4144 + }, + { + "epoch": 0.32, + "grad_norm": 1.2416731051250345, + "learning_rate": 1.586341034242284e-05, + "loss": 0.6188, + "step": 4145 + }, + { + "epoch": 0.32, + "grad_norm": 1.2532720655488314, + "learning_rate": 1.5861374732176332e-05, + "loss": 0.6122, + "step": 4146 + }, + { + "epoch": 0.32, + "grad_norm": 1.2004172831441842, + "learning_rate": 1.5859338751870998e-05, + "loss": 0.6108, + "step": 4147 + }, + { + "epoch": 0.32, + "grad_norm": 1.0671746695823765, + "learning_rate": 1.5857302401635373e-05, + "loss": 0.5586, + "step": 4148 + }, + { + "epoch": 0.32, + "grad_norm": 1.0990102433273106, + "learning_rate": 1.5855265681598032e-05, + "loss": 0.5914, + "step": 4149 + }, + { + "epoch": 0.32, + "grad_norm": 1.32162428478069, + "learning_rate": 1.585322859188756e-05, + "loss": 0.5657, + "step": 4150 + }, + { + "epoch": 0.32, + "grad_norm": 1.1037112442418504, + "learning_rate": 1.5851191132632563e-05, + "loss": 0.591, + "step": 4151 + }, + { + "epoch": 0.32, + "grad_norm": 1.2173754937429158, + "learning_rate": 1.584915330396169e-05, + "loss": 0.6426, + "step": 4152 + }, + { + "epoch": 0.32, + "grad_norm": 1.138650203649102, + "learning_rate": 1.5847115106003585e-05, + "loss": 0.5402, + "step": 4153 + }, + { + "epoch": 0.32, + "grad_norm": 1.2264169497295956, + "learning_rate": 1.5845076538886934e-05, + "loss": 0.5583, + "step": 4154 + }, + { + "epoch": 0.32, + "grad_norm": 1.2103888960961207, + "learning_rate": 1.584303760274045e-05, + "loss": 0.5968, + "step": 4155 + }, + { + "epoch": 0.32, + "grad_norm": 1.100631374446816, + "learning_rate": 1.5840998297692854e-05, + "loss": 0.5782, + "step": 4156 + }, + { + "epoch": 0.32, + "grad_norm": 1.3140736637253614, + "learning_rate": 1.5838958623872902e-05, + "loss": 0.582, + "step": 4157 + }, + { + "epoch": 0.32, + "grad_norm": 1.2300131302419608, + "learning_rate": 1.5836918581409365e-05, + "loss": 0.558, + "step": 4158 + }, + { + "epoch": 0.32, + "grad_norm": 1.1115516478035843, + "learning_rate": 1.583487817043104e-05, + "loss": 0.5221, + "step": 4159 + }, + { + "epoch": 0.32, + "grad_norm": 1.1952403925669344, + "learning_rate": 1.583283739106676e-05, + "loss": 0.6187, + "step": 4160 + }, + { + "epoch": 0.32, + "grad_norm": 1.197104911534478, + "learning_rate": 1.5830796243445357e-05, + "loss": 0.5644, + "step": 4161 + }, + { + "epoch": 0.32, + "grad_norm": 1.0958404726924815, + "learning_rate": 1.5828754727695703e-05, + "loss": 0.5338, + "step": 4162 + }, + { + "epoch": 0.32, + "grad_norm": 1.1056213273612223, + "learning_rate": 1.5826712843946693e-05, + "loss": 0.5319, + "step": 4163 + }, + { + "epoch": 0.32, + "grad_norm": 1.1494895113001469, + "learning_rate": 1.582467059232724e-05, + "loss": 0.5539, + "step": 4164 + }, + { + "epoch": 0.32, + "grad_norm": 1.1224026897365564, + "learning_rate": 1.582262797296628e-05, + "loss": 0.5406, + "step": 4165 + }, + { + "epoch": 0.32, + "grad_norm": 1.29688478948736, + "learning_rate": 1.5820584985992777e-05, + "loss": 0.5881, + "step": 4166 + }, + { + "epoch": 0.32, + "grad_norm": 1.1768997803204808, + "learning_rate": 1.581854163153571e-05, + "loss": 0.5997, + "step": 4167 + }, + { + "epoch": 0.32, + "grad_norm": 1.1589427214087622, + "learning_rate": 1.581649790972409e-05, + "loss": 0.5597, + "step": 4168 + }, + { + "epoch": 0.32, + "grad_norm": 1.134272456723087, + "learning_rate": 1.581445382068695e-05, + "loss": 0.561, + "step": 4169 + }, + { + "epoch": 0.32, + "grad_norm": 1.1222380536812362, + "learning_rate": 1.5812409364553344e-05, + "loss": 0.6166, + "step": 4170 + }, + { + "epoch": 0.32, + "grad_norm": 1.2356243329970122, + "learning_rate": 1.5810364541452342e-05, + "loss": 0.6328, + "step": 4171 + }, + { + "epoch": 0.32, + "grad_norm": 1.1618939881314763, + "learning_rate": 1.580831935151305e-05, + "loss": 0.6192, + "step": 4172 + }, + { + "epoch": 0.32, + "grad_norm": 1.1489546253816612, + "learning_rate": 1.5806273794864592e-05, + "loss": 0.5828, + "step": 4173 + }, + { + "epoch": 0.32, + "grad_norm": 1.1931558738298869, + "learning_rate": 1.5804227871636114e-05, + "loss": 0.6067, + "step": 4174 + }, + { + "epoch": 0.32, + "grad_norm": 1.224700340287395, + "learning_rate": 1.5802181581956782e-05, + "loss": 0.6024, + "step": 4175 + }, + { + "epoch": 0.32, + "grad_norm": 1.197164658836633, + "learning_rate": 1.5800134925955792e-05, + "loss": 0.6414, + "step": 4176 + }, + { + "epoch": 0.32, + "grad_norm": 1.0887383217957185, + "learning_rate": 1.579808790376236e-05, + "loss": 0.6058, + "step": 4177 + }, + { + "epoch": 0.32, + "grad_norm": 1.259549996307871, + "learning_rate": 1.5796040515505724e-05, + "loss": 0.6651, + "step": 4178 + }, + { + "epoch": 0.32, + "grad_norm": 1.0770380373394608, + "learning_rate": 1.5793992761315147e-05, + "loss": 0.5502, + "step": 4179 + }, + { + "epoch": 0.32, + "grad_norm": 1.2953728398348576, + "learning_rate": 1.5791944641319914e-05, + "loss": 0.6397, + "step": 4180 + }, + { + "epoch": 0.32, + "grad_norm": 1.2637665372647529, + "learning_rate": 1.5789896155649333e-05, + "loss": 0.5922, + "step": 4181 + }, + { + "epoch": 0.32, + "grad_norm": 1.2070424594790734, + "learning_rate": 1.578784730443273e-05, + "loss": 0.5758, + "step": 4182 + }, + { + "epoch": 0.32, + "grad_norm": 1.290383557880392, + "learning_rate": 1.5785798087799476e-05, + "loss": 0.5818, + "step": 4183 + }, + { + "epoch": 0.32, + "grad_norm": 1.2466818160311268, + "learning_rate": 1.5783748505878932e-05, + "loss": 0.6033, + "step": 4184 + }, + { + "epoch": 0.32, + "grad_norm": 1.371210468247492, + "learning_rate": 1.5781698558800503e-05, + "loss": 0.6262, + "step": 4185 + }, + { + "epoch": 0.32, + "grad_norm": 1.185993694838671, + "learning_rate": 1.577964824669362e-05, + "loss": 0.6076, + "step": 4186 + }, + { + "epoch": 0.32, + "grad_norm": 1.190057510140332, + "learning_rate": 1.577759756968772e-05, + "loss": 0.5602, + "step": 4187 + }, + { + "epoch": 0.32, + "grad_norm": 1.1809004554076394, + "learning_rate": 1.577554652791228e-05, + "loss": 0.5978, + "step": 4188 + }, + { + "epoch": 0.32, + "grad_norm": 1.238725889261915, + "learning_rate": 1.5773495121496787e-05, + "loss": 0.5387, + "step": 4189 + }, + { + "epoch": 0.33, + "grad_norm": 1.12641822642419, + "learning_rate": 1.5771443350570756e-05, + "loss": 0.6105, + "step": 4190 + }, + { + "epoch": 0.33, + "grad_norm": 1.2080687474368461, + "learning_rate": 1.5769391215263737e-05, + "loss": 0.5477, + "step": 4191 + }, + { + "epoch": 0.33, + "grad_norm": 1.189125554622287, + "learning_rate": 1.576733871570528e-05, + "loss": 0.6163, + "step": 4192 + }, + { + "epoch": 0.33, + "grad_norm": 1.1753070166413515, + "learning_rate": 1.576528585202498e-05, + "loss": 0.6237, + "step": 4193 + }, + { + "epoch": 0.33, + "grad_norm": 1.2476602591281176, + "learning_rate": 1.576323262435243e-05, + "loss": 0.5966, + "step": 4194 + }, + { + "epoch": 0.33, + "grad_norm": 1.1575786071187284, + "learning_rate": 1.5761179032817275e-05, + "loss": 0.56, + "step": 4195 + }, + { + "epoch": 0.33, + "grad_norm": 1.247796787284363, + "learning_rate": 1.5759125077549164e-05, + "loss": 0.5504, + "step": 4196 + }, + { + "epoch": 0.33, + "grad_norm": 1.2886518344436713, + "learning_rate": 1.5757070758677775e-05, + "loss": 0.6028, + "step": 4197 + }, + { + "epoch": 0.33, + "grad_norm": 1.1983164521002307, + "learning_rate": 1.5755016076332805e-05, + "loss": 0.5647, + "step": 4198 + }, + { + "epoch": 0.33, + "grad_norm": 1.1704388783240482, + "learning_rate": 1.5752961030643978e-05, + "loss": 0.5942, + "step": 4199 + }, + { + "epoch": 0.33, + "grad_norm": 1.1912501999013279, + "learning_rate": 1.5750905621741037e-05, + "loss": 0.6125, + "step": 4200 + }, + { + "epoch": 0.33, + "grad_norm": 1.135771156525432, + "learning_rate": 1.5748849849753757e-05, + "loss": 0.5413, + "step": 4201 + }, + { + "epoch": 0.33, + "grad_norm": 1.084404226563342, + "learning_rate": 1.5746793714811925e-05, + "loss": 0.5594, + "step": 4202 + }, + { + "epoch": 0.33, + "grad_norm": 1.177701124035044, + "learning_rate": 1.5744737217045355e-05, + "loss": 0.5573, + "step": 4203 + }, + { + "epoch": 0.33, + "grad_norm": 1.1667078669403046, + "learning_rate": 1.574268035658389e-05, + "loss": 0.5941, + "step": 4204 + }, + { + "epoch": 0.33, + "grad_norm": 1.3080694714100287, + "learning_rate": 1.574062313355738e-05, + "loss": 0.6122, + "step": 4205 + }, + { + "epoch": 0.33, + "grad_norm": 1.1308704434974841, + "learning_rate": 1.5738565548095718e-05, + "loss": 0.5881, + "step": 4206 + }, + { + "epoch": 0.33, + "grad_norm": 1.2058847160877186, + "learning_rate": 1.5736507600328804e-05, + "loss": 0.6081, + "step": 4207 + }, + { + "epoch": 0.33, + "grad_norm": 1.3394470072306475, + "learning_rate": 1.5734449290386568e-05, + "loss": 0.6675, + "step": 4208 + }, + { + "epoch": 0.33, + "grad_norm": 1.3144116331356501, + "learning_rate": 1.5732390618398966e-05, + "loss": 0.6165, + "step": 4209 + }, + { + "epoch": 0.33, + "grad_norm": 1.1398986698591642, + "learning_rate": 1.5730331584495965e-05, + "loss": 0.6137, + "step": 4210 + }, + { + "epoch": 0.33, + "grad_norm": 1.1406860335233333, + "learning_rate": 1.5728272188807564e-05, + "loss": 0.5765, + "step": 4211 + }, + { + "epoch": 0.33, + "grad_norm": 1.1929319025031162, + "learning_rate": 1.572621243146379e-05, + "loss": 0.5761, + "step": 4212 + }, + { + "epoch": 0.33, + "grad_norm": 1.2316240004056611, + "learning_rate": 1.5724152312594683e-05, + "loss": 0.5776, + "step": 4213 + }, + { + "epoch": 0.33, + "grad_norm": 1.2885305983433712, + "learning_rate": 1.5722091832330307e-05, + "loss": 0.6087, + "step": 4214 + }, + { + "epoch": 0.33, + "grad_norm": 1.2260871014512607, + "learning_rate": 1.572003099080075e-05, + "loss": 0.5777, + "step": 4215 + }, + { + "epoch": 0.33, + "grad_norm": 1.0597741559392169, + "learning_rate": 1.5717969788136123e-05, + "loss": 0.5589, + "step": 4216 + }, + { + "epoch": 0.33, + "grad_norm": 1.2328517544954958, + "learning_rate": 1.571590822446657e-05, + "loss": 0.5962, + "step": 4217 + }, + { + "epoch": 0.33, + "grad_norm": 1.180643818242358, + "learning_rate": 1.571384629992223e-05, + "loss": 0.6047, + "step": 4218 + }, + { + "epoch": 0.33, + "grad_norm": 1.153029208869553, + "learning_rate": 1.57117840146333e-05, + "loss": 0.6125, + "step": 4219 + }, + { + "epoch": 0.33, + "grad_norm": 1.122415912681638, + "learning_rate": 1.5709721368729977e-05, + "loss": 0.5619, + "step": 4220 + }, + { + "epoch": 0.33, + "grad_norm": 1.2249023963035894, + "learning_rate": 1.5707658362342484e-05, + "loss": 0.6708, + "step": 4221 + }, + { + "epoch": 0.33, + "grad_norm": 1.343392124869427, + "learning_rate": 1.570559499560107e-05, + "loss": 0.5913, + "step": 4222 + }, + { + "epoch": 0.33, + "grad_norm": 1.2005444880849245, + "learning_rate": 1.570353126863601e-05, + "loss": 0.6078, + "step": 4223 + }, + { + "epoch": 0.33, + "grad_norm": 1.1056571233814998, + "learning_rate": 1.570146718157759e-05, + "loss": 0.5583, + "step": 4224 + }, + { + "epoch": 0.33, + "grad_norm": 1.1242107165740816, + "learning_rate": 1.5699402734556133e-05, + "loss": 0.5937, + "step": 4225 + }, + { + "epoch": 0.33, + "grad_norm": 1.1543452580311813, + "learning_rate": 1.5697337927701977e-05, + "loss": 0.5595, + "step": 4226 + }, + { + "epoch": 0.33, + "grad_norm": 1.267246008861621, + "learning_rate": 1.5695272761145486e-05, + "loss": 0.6382, + "step": 4227 + }, + { + "epoch": 0.33, + "grad_norm": 1.1132376143620815, + "learning_rate": 1.569320723501704e-05, + "loss": 0.587, + "step": 4228 + }, + { + "epoch": 0.33, + "grad_norm": 1.164809838357185, + "learning_rate": 1.5691141349447046e-05, + "loss": 0.5429, + "step": 4229 + }, + { + "epoch": 0.33, + "grad_norm": 1.2153910296516017, + "learning_rate": 1.5689075104565936e-05, + "loss": 0.6058, + "step": 4230 + }, + { + "epoch": 0.33, + "grad_norm": 1.1422446783237805, + "learning_rate": 1.5687008500504165e-05, + "loss": 0.6221, + "step": 4231 + }, + { + "epoch": 0.33, + "grad_norm": 1.1590503597986488, + "learning_rate": 1.5684941537392205e-05, + "loss": 0.6127, + "step": 4232 + }, + { + "epoch": 0.33, + "grad_norm": 1.2456341795863013, + "learning_rate": 1.5682874215360557e-05, + "loss": 0.5934, + "step": 4233 + }, + { + "epoch": 0.33, + "grad_norm": 1.2216283115040354, + "learning_rate": 1.568080653453974e-05, + "loss": 0.5776, + "step": 4234 + }, + { + "epoch": 0.33, + "grad_norm": 1.0893601195587912, + "learning_rate": 1.5678738495060292e-05, + "loss": 0.5819, + "step": 4235 + }, + { + "epoch": 0.33, + "grad_norm": 1.0774837743152603, + "learning_rate": 1.567667009705279e-05, + "loss": 0.509, + "step": 4236 + }, + { + "epoch": 0.33, + "grad_norm": 1.0842760947906658, + "learning_rate": 1.567460134064782e-05, + "loss": 0.5292, + "step": 4237 + }, + { + "epoch": 0.33, + "grad_norm": 1.1248018832071582, + "learning_rate": 1.5672532225975983e-05, + "loss": 0.5691, + "step": 4238 + }, + { + "epoch": 0.33, + "grad_norm": 1.189464800797708, + "learning_rate": 1.5670462753167922e-05, + "loss": 0.5998, + "step": 4239 + }, + { + "epoch": 0.33, + "grad_norm": 1.3506471036684864, + "learning_rate": 1.566839292235429e-05, + "loss": 0.6264, + "step": 4240 + }, + { + "epoch": 0.33, + "grad_norm": 1.1231247319309805, + "learning_rate": 1.5666322733665773e-05, + "loss": 0.6178, + "step": 4241 + }, + { + "epoch": 0.33, + "grad_norm": 1.1648143414086092, + "learning_rate": 1.5664252187233066e-05, + "loss": 0.5785, + "step": 4242 + }, + { + "epoch": 0.33, + "grad_norm": 1.2548386859185994, + "learning_rate": 1.5662181283186894e-05, + "loss": 0.6065, + "step": 4243 + }, + { + "epoch": 0.33, + "grad_norm": 1.2017569752874238, + "learning_rate": 1.5660110021658002e-05, + "loss": 0.5535, + "step": 4244 + }, + { + "epoch": 0.33, + "grad_norm": 1.1170711323347944, + "learning_rate": 1.5658038402777165e-05, + "loss": 0.5489, + "step": 4245 + }, + { + "epoch": 0.33, + "grad_norm": 1.1981973678551041, + "learning_rate": 1.5655966426675172e-05, + "loss": 0.6382, + "step": 4246 + }, + { + "epoch": 0.33, + "grad_norm": 1.2080716090830148, + "learning_rate": 1.5653894093482835e-05, + "loss": 0.5793, + "step": 4247 + }, + { + "epoch": 0.33, + "grad_norm": 1.1014751102061386, + "learning_rate": 1.5651821403330996e-05, + "loss": 0.5579, + "step": 4248 + }, + { + "epoch": 0.33, + "grad_norm": 1.1856763288279222, + "learning_rate": 1.5649748356350513e-05, + "loss": 0.6111, + "step": 4249 + }, + { + "epoch": 0.33, + "grad_norm": 1.111876608818996, + "learning_rate": 1.5647674952672265e-05, + "loss": 0.5396, + "step": 4250 + }, + { + "epoch": 0.33, + "grad_norm": 1.1950902794609333, + "learning_rate": 1.564560119242716e-05, + "loss": 0.681, + "step": 4251 + }, + { + "epoch": 0.33, + "grad_norm": 1.2280012306182362, + "learning_rate": 1.5643527075746125e-05, + "loss": 0.5947, + "step": 4252 + }, + { + "epoch": 0.33, + "grad_norm": 1.1791713166341768, + "learning_rate": 1.564145260276011e-05, + "loss": 0.5775, + "step": 4253 + }, + { + "epoch": 0.33, + "grad_norm": 1.2863204460019095, + "learning_rate": 1.563937777360008e-05, + "loss": 0.6008, + "step": 4254 + }, + { + "epoch": 0.33, + "grad_norm": 1.15050471052642, + "learning_rate": 1.563730258839704e-05, + "loss": 0.6132, + "step": 4255 + }, + { + "epoch": 0.33, + "grad_norm": 1.1993216305816103, + "learning_rate": 1.5635227047282005e-05, + "loss": 0.5771, + "step": 4256 + }, + { + "epoch": 0.33, + "grad_norm": 1.1615289344484334, + "learning_rate": 1.563315115038601e-05, + "loss": 0.561, + "step": 4257 + }, + { + "epoch": 0.33, + "grad_norm": 1.1735187954431585, + "learning_rate": 1.563107489784012e-05, + "loss": 0.5682, + "step": 4258 + }, + { + "epoch": 0.33, + "grad_norm": 1.2188851697253185, + "learning_rate": 1.562899828977542e-05, + "loss": 0.5923, + "step": 4259 + }, + { + "epoch": 0.33, + "grad_norm": 1.1471939100867332, + "learning_rate": 1.5626921326323016e-05, + "loss": 0.5777, + "step": 4260 + }, + { + "epoch": 0.33, + "grad_norm": 1.1356480330638674, + "learning_rate": 1.5624844007614037e-05, + "loss": 0.6542, + "step": 4261 + }, + { + "epoch": 0.33, + "grad_norm": 1.295458192074889, + "learning_rate": 1.5622766333779637e-05, + "loss": 0.6568, + "step": 4262 + }, + { + "epoch": 0.33, + "grad_norm": 1.1669861095062695, + "learning_rate": 1.5620688304950985e-05, + "loss": 0.6234, + "step": 4263 + }, + { + "epoch": 0.33, + "grad_norm": 1.186047317989056, + "learning_rate": 1.5618609921259286e-05, + "loss": 0.5964, + "step": 4264 + }, + { + "epoch": 0.33, + "grad_norm": 1.238075503997241, + "learning_rate": 1.561653118283575e-05, + "loss": 0.5791, + "step": 4265 + }, + { + "epoch": 0.33, + "grad_norm": 1.1897992912898259, + "learning_rate": 1.5614452089811628e-05, + "loss": 0.6058, + "step": 4266 + }, + { + "epoch": 0.33, + "grad_norm": 1.0708700245509777, + "learning_rate": 1.5612372642318176e-05, + "loss": 0.5759, + "step": 4267 + }, + { + "epoch": 0.33, + "grad_norm": 1.1604674978212233, + "learning_rate": 1.5610292840486684e-05, + "loss": 0.5637, + "step": 4268 + }, + { + "epoch": 0.33, + "grad_norm": 1.1496032714681725, + "learning_rate": 1.5608212684448453e-05, + "loss": 0.6144, + "step": 4269 + }, + { + "epoch": 0.33, + "grad_norm": 1.2419530299361183, + "learning_rate": 1.560613217433483e-05, + "loss": 0.6095, + "step": 4270 + }, + { + "epoch": 0.33, + "grad_norm": 1.2836035534791388, + "learning_rate": 1.5604051310277152e-05, + "loss": 0.6439, + "step": 4271 + }, + { + "epoch": 0.33, + "grad_norm": 1.1591084176371425, + "learning_rate": 1.5601970092406807e-05, + "loss": 0.5686, + "step": 4272 + }, + { + "epoch": 0.33, + "grad_norm": 1.13279150581637, + "learning_rate": 1.5599888520855186e-05, + "loss": 0.5598, + "step": 4273 + }, + { + "epoch": 0.33, + "grad_norm": 1.203480606040116, + "learning_rate": 1.559780659575371e-05, + "loss": 0.588, + "step": 4274 + }, + { + "epoch": 0.33, + "grad_norm": 1.130988816938756, + "learning_rate": 1.559572431723382e-05, + "loss": 0.5794, + "step": 4275 + }, + { + "epoch": 0.33, + "grad_norm": 1.2944949728027186, + "learning_rate": 1.559364168542699e-05, + "loss": 0.6655, + "step": 4276 + }, + { + "epoch": 0.33, + "grad_norm": 1.1272260047928933, + "learning_rate": 1.55915587004647e-05, + "loss": 0.5523, + "step": 4277 + }, + { + "epoch": 0.33, + "grad_norm": 1.2182135746373812, + "learning_rate": 1.5589475362478458e-05, + "loss": 0.5855, + "step": 4278 + }, + { + "epoch": 0.33, + "grad_norm": 1.3181940271744912, + "learning_rate": 1.5587391671599798e-05, + "loss": 0.6113, + "step": 4279 + }, + { + "epoch": 0.33, + "grad_norm": 1.2505427612686963, + "learning_rate": 1.5585307627960277e-05, + "loss": 0.5963, + "step": 4280 + }, + { + "epoch": 0.33, + "grad_norm": 1.0647500840075304, + "learning_rate": 1.5583223231691466e-05, + "loss": 0.5443, + "step": 4281 + }, + { + "epoch": 0.33, + "grad_norm": 1.0774165604007833, + "learning_rate": 1.558113848292497e-05, + "loss": 0.5454, + "step": 4282 + }, + { + "epoch": 0.33, + "grad_norm": 1.109701994938504, + "learning_rate": 1.5579053381792406e-05, + "loss": 0.5765, + "step": 4283 + }, + { + "epoch": 0.33, + "grad_norm": 1.1624594794153704, + "learning_rate": 1.5576967928425414e-05, + "loss": 0.5757, + "step": 4284 + }, + { + "epoch": 0.33, + "grad_norm": 1.281872388817533, + "learning_rate": 1.5574882122955668e-05, + "loss": 0.5887, + "step": 4285 + }, + { + "epoch": 0.33, + "grad_norm": 1.1957303482441077, + "learning_rate": 1.557279596551485e-05, + "loss": 0.5552, + "step": 4286 + }, + { + "epoch": 0.33, + "grad_norm": 1.1569548855346417, + "learning_rate": 1.557070945623467e-05, + "loss": 0.6048, + "step": 4287 + }, + { + "epoch": 0.33, + "grad_norm": 1.1554446121800024, + "learning_rate": 1.556862259524686e-05, + "loss": 0.5987, + "step": 4288 + }, + { + "epoch": 0.33, + "grad_norm": 1.0879999239409643, + "learning_rate": 1.556653538268318e-05, + "loss": 0.5325, + "step": 4289 + }, + { + "epoch": 0.33, + "grad_norm": 1.2413542249856198, + "learning_rate": 1.5564447818675397e-05, + "loss": 0.5897, + "step": 4290 + }, + { + "epoch": 0.33, + "grad_norm": 1.1876448492750868, + "learning_rate": 1.5562359903355315e-05, + "loss": 0.5331, + "step": 4291 + }, + { + "epoch": 0.33, + "grad_norm": 1.1003812584361574, + "learning_rate": 1.5560271636854757e-05, + "loss": 0.5632, + "step": 4292 + }, + { + "epoch": 0.33, + "grad_norm": 1.1683119457824958, + "learning_rate": 1.555818301930556e-05, + "loss": 0.5694, + "step": 4293 + }, + { + "epoch": 0.33, + "grad_norm": 1.112776524755778, + "learning_rate": 1.555609405083959e-05, + "loss": 0.5432, + "step": 4294 + }, + { + "epoch": 0.33, + "grad_norm": 1.3055725293825904, + "learning_rate": 1.5554004731588745e-05, + "loss": 0.6332, + "step": 4295 + }, + { + "epoch": 0.33, + "grad_norm": 1.1394203827357794, + "learning_rate": 1.555191506168492e-05, + "loss": 0.5719, + "step": 4296 + }, + { + "epoch": 0.33, + "grad_norm": 1.3082075317709538, + "learning_rate": 1.5549825041260052e-05, + "loss": 0.624, + "step": 4297 + }, + { + "epoch": 0.33, + "grad_norm": 1.341341388358419, + "learning_rate": 1.5547734670446103e-05, + "loss": 0.6568, + "step": 4298 + }, + { + "epoch": 0.33, + "grad_norm": 1.2121439184785832, + "learning_rate": 1.554564394937504e-05, + "loss": 0.6046, + "step": 4299 + }, + { + "epoch": 0.33, + "grad_norm": 1.3055558199014239, + "learning_rate": 1.5543552878178857e-05, + "loss": 0.6063, + "step": 4300 + }, + { + "epoch": 0.33, + "grad_norm": 1.2305411181285864, + "learning_rate": 1.5541461456989583e-05, + "loss": 0.649, + "step": 4301 + }, + { + "epoch": 0.33, + "grad_norm": 1.192278983229602, + "learning_rate": 1.553936968593926e-05, + "loss": 0.6031, + "step": 4302 + }, + { + "epoch": 0.33, + "grad_norm": 1.0950641366822178, + "learning_rate": 1.5537277565159944e-05, + "loss": 0.6649, + "step": 4303 + }, + { + "epoch": 0.33, + "grad_norm": 1.1334477715122804, + "learning_rate": 1.5535185094783728e-05, + "loss": 0.5492, + "step": 4304 + }, + { + "epoch": 0.33, + "grad_norm": 1.1961000129640387, + "learning_rate": 1.5533092274942724e-05, + "loss": 0.6009, + "step": 4305 + }, + { + "epoch": 0.33, + "grad_norm": 1.266119118767877, + "learning_rate": 1.553099910576905e-05, + "loss": 0.5972, + "step": 4306 + }, + { + "epoch": 0.33, + "grad_norm": 1.2080482223277254, + "learning_rate": 1.5528905587394872e-05, + "loss": 0.5804, + "step": 4307 + }, + { + "epoch": 0.33, + "grad_norm": 1.205405958892666, + "learning_rate": 1.5526811719952356e-05, + "loss": 0.6101, + "step": 4308 + }, + { + "epoch": 0.33, + "grad_norm": 1.1561293667935484, + "learning_rate": 1.55247175035737e-05, + "loss": 0.6169, + "step": 4309 + }, + { + "epoch": 0.33, + "grad_norm": 1.2087806443725497, + "learning_rate": 1.5522622938391132e-05, + "loss": 0.6186, + "step": 4310 + }, + { + "epoch": 0.33, + "grad_norm": 1.2466818160311268, + "learning_rate": 1.552052802453688e-05, + "loss": 0.6416, + "step": 4311 + }, + { + "epoch": 0.33, + "grad_norm": 1.2414732024933746, + "learning_rate": 1.551843276214321e-05, + "loss": 0.5562, + "step": 4312 + }, + { + "epoch": 0.33, + "grad_norm": 1.3189507860397496, + "learning_rate": 1.551633715134241e-05, + "loss": 0.669, + "step": 4313 + }, + { + "epoch": 0.33, + "grad_norm": 1.2859813047714257, + "learning_rate": 1.5514241192266786e-05, + "loss": 0.6141, + "step": 4314 + }, + { + "epoch": 0.33, + "grad_norm": 1.0726381637926647, + "learning_rate": 1.5512144885048664e-05, + "loss": 0.6105, + "step": 4315 + }, + { + "epoch": 0.33, + "grad_norm": 1.2531460748360561, + "learning_rate": 1.5510048229820398e-05, + "loss": 0.5823, + "step": 4316 + }, + { + "epoch": 0.33, + "grad_norm": 1.2201612078360424, + "learning_rate": 1.5507951226714356e-05, + "loss": 0.5963, + "step": 4317 + }, + { + "epoch": 0.33, + "grad_norm": 1.2030143686928403, + "learning_rate": 1.550585387586294e-05, + "loss": 0.6031, + "step": 4318 + }, + { + "epoch": 0.34, + "grad_norm": 1.1650962591720035, + "learning_rate": 1.550375617739856e-05, + "loss": 0.6219, + "step": 4319 + }, + { + "epoch": 0.34, + "grad_norm": 1.2110246196294916, + "learning_rate": 1.550165813145366e-05, + "loss": 0.5959, + "step": 4320 + }, + { + "epoch": 0.34, + "grad_norm": 1.2279511869868345, + "learning_rate": 1.5499559738160693e-05, + "loss": 0.6043, + "step": 4321 + }, + { + "epoch": 0.34, + "grad_norm": 1.063445568070926, + "learning_rate": 1.549746099765215e-05, + "loss": 0.5189, + "step": 4322 + }, + { + "epoch": 0.34, + "grad_norm": 1.1293839232741927, + "learning_rate": 1.5495361910060527e-05, + "loss": 0.5382, + "step": 4323 + }, + { + "epoch": 0.34, + "grad_norm": 1.1155763759755608, + "learning_rate": 1.5493262475518353e-05, + "loss": 0.6106, + "step": 4324 + }, + { + "epoch": 0.34, + "grad_norm": 1.1945891341960149, + "learning_rate": 1.5491162694158182e-05, + "loss": 0.5521, + "step": 4325 + }, + { + "epoch": 0.34, + "grad_norm": 1.3651839480275945, + "learning_rate": 1.548906256611258e-05, + "loss": 0.609, + "step": 4326 + }, + { + "epoch": 0.34, + "grad_norm": 1.2565710445133467, + "learning_rate": 1.5486962091514133e-05, + "loss": 0.6111, + "step": 4327 + }, + { + "epoch": 0.34, + "grad_norm": 1.1392565840530247, + "learning_rate": 1.5484861270495464e-05, + "loss": 0.5218, + "step": 4328 + }, + { + "epoch": 0.34, + "grad_norm": 1.2287283078022297, + "learning_rate": 1.5482760103189203e-05, + "loss": 0.6049, + "step": 4329 + }, + { + "epoch": 0.34, + "grad_norm": 1.2041847713829785, + "learning_rate": 1.548065858972801e-05, + "loss": 0.5831, + "step": 4330 + }, + { + "epoch": 0.34, + "grad_norm": 1.1843491966637465, + "learning_rate": 1.5478556730244564e-05, + "loss": 0.5631, + "step": 4331 + }, + { + "epoch": 0.34, + "grad_norm": 1.0813510560897177, + "learning_rate": 1.5476454524871566e-05, + "loss": 0.5709, + "step": 4332 + }, + { + "epoch": 0.34, + "grad_norm": 1.194321614364365, + "learning_rate": 1.5474351973741742e-05, + "loss": 0.6366, + "step": 4333 + }, + { + "epoch": 0.34, + "grad_norm": 1.2448822158801993, + "learning_rate": 1.547224907698783e-05, + "loss": 0.6267, + "step": 4334 + }, + { + "epoch": 0.34, + "grad_norm": 1.1498157768304422, + "learning_rate": 1.54701458347426e-05, + "loss": 0.5494, + "step": 4335 + }, + { + "epoch": 0.34, + "grad_norm": 1.0574603329877628, + "learning_rate": 1.5468042247138844e-05, + "loss": 0.5413, + "step": 4336 + }, + { + "epoch": 0.34, + "grad_norm": 1.1500462273967669, + "learning_rate": 1.5465938314309367e-05, + "loss": 0.5746, + "step": 4337 + }, + { + "epoch": 0.34, + "grad_norm": 1.1411223176558392, + "learning_rate": 1.546383403638701e-05, + "loss": 0.5566, + "step": 4338 + }, + { + "epoch": 0.34, + "grad_norm": 1.225857137029262, + "learning_rate": 1.5461729413504613e-05, + "loss": 0.6297, + "step": 4339 + }, + { + "epoch": 0.34, + "grad_norm": 1.0602756666311752, + "learning_rate": 1.5459624445795062e-05, + "loss": 0.5592, + "step": 4340 + }, + { + "epoch": 0.34, + "grad_norm": 1.113962172689552, + "learning_rate": 1.545751913339125e-05, + "loss": 0.5249, + "step": 4341 + }, + { + "epoch": 0.34, + "grad_norm": 1.2147959610840169, + "learning_rate": 1.54554134764261e-05, + "loss": 0.6144, + "step": 4342 + }, + { + "epoch": 0.34, + "grad_norm": 1.189400657661709, + "learning_rate": 1.5453307475032552e-05, + "loss": 0.5544, + "step": 4343 + }, + { + "epoch": 0.34, + "grad_norm": 1.1880256091931498, + "learning_rate": 1.5451201129343566e-05, + "loss": 0.5899, + "step": 4344 + }, + { + "epoch": 0.34, + "grad_norm": 1.1813865703774582, + "learning_rate": 1.544909443949213e-05, + "loss": 0.6005, + "step": 4345 + }, + { + "epoch": 0.34, + "grad_norm": 1.0560182441305859, + "learning_rate": 1.5446987405611248e-05, + "loss": 0.5167, + "step": 4346 + }, + { + "epoch": 0.34, + "grad_norm": 1.1708212119908372, + "learning_rate": 1.5444880027833947e-05, + "loss": 0.5678, + "step": 4347 + }, + { + "epoch": 0.34, + "grad_norm": 1.2678871189618748, + "learning_rate": 1.5442772306293277e-05, + "loss": 0.6456, + "step": 4348 + }, + { + "epoch": 0.34, + "grad_norm": 1.081308006097804, + "learning_rate": 1.5440664241122312e-05, + "loss": 0.5323, + "step": 4349 + }, + { + "epoch": 0.34, + "grad_norm": 1.2158441380383915, + "learning_rate": 1.5438555832454143e-05, + "loss": 0.6012, + "step": 4350 + }, + { + "epoch": 0.34, + "grad_norm": 1.1261988715655107, + "learning_rate": 1.5436447080421887e-05, + "loss": 0.6277, + "step": 4351 + }, + { + "epoch": 0.34, + "grad_norm": 1.1096401166454524, + "learning_rate": 1.5434337985158674e-05, + "loss": 0.5559, + "step": 4352 + }, + { + "epoch": 0.34, + "grad_norm": 1.2222174201254479, + "learning_rate": 1.5432228546797668e-05, + "loss": 0.5718, + "step": 4353 + }, + { + "epoch": 0.34, + "grad_norm": 1.2757120238640933, + "learning_rate": 1.5430118765472052e-05, + "loss": 0.582, + "step": 4354 + }, + { + "epoch": 0.34, + "grad_norm": 1.1499292973884567, + "learning_rate": 1.5428008641315018e-05, + "loss": 0.5528, + "step": 4355 + }, + { + "epoch": 0.34, + "grad_norm": 1.1802928457727033, + "learning_rate": 1.5425898174459794e-05, + "loss": 0.5814, + "step": 4356 + }, + { + "epoch": 0.34, + "grad_norm": 1.0787393782038581, + "learning_rate": 1.5423787365039627e-05, + "loss": 0.533, + "step": 4357 + }, + { + "epoch": 0.34, + "grad_norm": 1.2340041460588937, + "learning_rate": 1.5421676213187774e-05, + "loss": 0.6392, + "step": 4358 + }, + { + "epoch": 0.34, + "grad_norm": 1.2280317606002826, + "learning_rate": 1.5419564719037536e-05, + "loss": 0.6096, + "step": 4359 + }, + { + "epoch": 0.34, + "grad_norm": 1.248507371458292, + "learning_rate": 1.5417452882722214e-05, + "loss": 0.6251, + "step": 4360 + }, + { + "epoch": 0.34, + "grad_norm": 1.226481441079024, + "learning_rate": 1.541534070437514e-05, + "loss": 0.6905, + "step": 4361 + }, + { + "epoch": 0.34, + "grad_norm": 1.1224117174669375, + "learning_rate": 1.541322818412967e-05, + "loss": 0.5736, + "step": 4362 + }, + { + "epoch": 0.34, + "grad_norm": 1.1324407033154424, + "learning_rate": 1.5411115322119176e-05, + "loss": 0.5572, + "step": 4363 + }, + { + "epoch": 0.34, + "grad_norm": 1.25963105658449, + "learning_rate": 1.5409002118477053e-05, + "loss": 0.6141, + "step": 4364 + }, + { + "epoch": 0.34, + "grad_norm": 1.2681653465594676, + "learning_rate": 1.540688857333672e-05, + "loss": 0.599, + "step": 4365 + }, + { + "epoch": 0.34, + "grad_norm": 1.214661121440196, + "learning_rate": 1.5404774686831615e-05, + "loss": 0.6299, + "step": 4366 + }, + { + "epoch": 0.34, + "grad_norm": 1.21888746806788, + "learning_rate": 1.54026604590952e-05, + "loss": 0.5957, + "step": 4367 + }, + { + "epoch": 0.34, + "grad_norm": 1.1919438884412488, + "learning_rate": 1.540054589026095e-05, + "loss": 0.5919, + "step": 4368 + }, + { + "epoch": 0.34, + "grad_norm": 1.2466692895827691, + "learning_rate": 1.5398430980462382e-05, + "loss": 0.5912, + "step": 4369 + }, + { + "epoch": 0.34, + "grad_norm": 1.1907434339516478, + "learning_rate": 1.5396315729833015e-05, + "loss": 0.5307, + "step": 4370 + }, + { + "epoch": 0.34, + "grad_norm": 1.1197120046742197, + "learning_rate": 1.5394200138506393e-05, + "loss": 0.5414, + "step": 4371 + }, + { + "epoch": 0.34, + "grad_norm": 1.1677851708092752, + "learning_rate": 1.5392084206616084e-05, + "loss": 0.5809, + "step": 4372 + }, + { + "epoch": 0.34, + "grad_norm": 1.1437043206937372, + "learning_rate": 1.5389967934295677e-05, + "loss": 0.5991, + "step": 4373 + }, + { + "epoch": 0.34, + "grad_norm": 1.0779447543084313, + "learning_rate": 1.5387851321678788e-05, + "loss": 0.5676, + "step": 4374 + }, + { + "epoch": 0.34, + "grad_norm": 1.2527462355953578, + "learning_rate": 1.538573436889905e-05, + "loss": 0.6305, + "step": 4375 + }, + { + "epoch": 0.34, + "grad_norm": 1.1712196043257672, + "learning_rate": 1.5383617076090114e-05, + "loss": 0.6145, + "step": 4376 + }, + { + "epoch": 0.34, + "grad_norm": 1.1639391430516126, + "learning_rate": 1.5381499443385653e-05, + "loss": 0.5949, + "step": 4377 + }, + { + "epoch": 0.34, + "grad_norm": 1.1495743917624666, + "learning_rate": 1.537938147091937e-05, + "loss": 0.5887, + "step": 4378 + }, + { + "epoch": 0.34, + "grad_norm": 1.3407043273504264, + "learning_rate": 1.537726315882498e-05, + "loss": 0.6772, + "step": 4379 + }, + { + "epoch": 0.34, + "grad_norm": 1.250110859723331, + "learning_rate": 1.5375144507236222e-05, + "loss": 0.5646, + "step": 4380 + }, + { + "epoch": 0.34, + "grad_norm": 1.2708810526325784, + "learning_rate": 1.537302551628686e-05, + "loss": 0.6346, + "step": 4381 + }, + { + "epoch": 0.34, + "grad_norm": 1.0891656988169294, + "learning_rate": 1.5370906186110677e-05, + "loss": 0.5442, + "step": 4382 + }, + { + "epoch": 0.34, + "grad_norm": 1.2144545149281407, + "learning_rate": 1.536878651684148e-05, + "loss": 0.5854, + "step": 4383 + }, + { + "epoch": 0.34, + "grad_norm": 1.2673010853991138, + "learning_rate": 1.5366666508613083e-05, + "loss": 0.6416, + "step": 4384 + }, + { + "epoch": 0.34, + "grad_norm": 1.1962630537670147, + "learning_rate": 1.536454616155935e-05, + "loss": 0.5655, + "step": 4385 + }, + { + "epoch": 0.34, + "grad_norm": 1.2514959443834761, + "learning_rate": 1.5362425475814133e-05, + "loss": 0.5454, + "step": 4386 + }, + { + "epoch": 0.34, + "grad_norm": 1.1835336701406634, + "learning_rate": 1.5360304451511333e-05, + "loss": 0.5463, + "step": 4387 + }, + { + "epoch": 0.34, + "grad_norm": 1.2501789441770663, + "learning_rate": 1.5358183088784853e-05, + "loss": 0.5776, + "step": 4388 + }, + { + "epoch": 0.34, + "grad_norm": 1.2462837767926083, + "learning_rate": 1.5356061387768634e-05, + "loss": 0.5915, + "step": 4389 + }, + { + "epoch": 0.34, + "grad_norm": 1.1739182524903378, + "learning_rate": 1.535393934859663e-05, + "loss": 0.524, + "step": 4390 + }, + { + "epoch": 0.34, + "grad_norm": 1.230764153761204, + "learning_rate": 1.5351816971402803e-05, + "loss": 0.6316, + "step": 4391 + }, + { + "epoch": 0.34, + "grad_norm": 1.097198119213983, + "learning_rate": 1.5349694256321162e-05, + "loss": 0.5255, + "step": 4392 + }, + { + "epoch": 0.34, + "grad_norm": 1.0370746613923232, + "learning_rate": 1.5347571203485723e-05, + "loss": 0.5667, + "step": 4393 + }, + { + "epoch": 0.34, + "grad_norm": 1.0880541584370695, + "learning_rate": 1.5345447813030526e-05, + "loss": 0.5446, + "step": 4394 + }, + { + "epoch": 0.34, + "grad_norm": 1.2309443932035438, + "learning_rate": 1.5343324085089628e-05, + "loss": 0.6333, + "step": 4395 + }, + { + "epoch": 0.34, + "grad_norm": 1.1745729542887466, + "learning_rate": 1.534120001979711e-05, + "loss": 0.5948, + "step": 4396 + }, + { + "epoch": 0.34, + "grad_norm": 1.2211402537642444, + "learning_rate": 1.533907561728708e-05, + "loss": 0.5645, + "step": 4397 + }, + { + "epoch": 0.34, + "grad_norm": 1.1916706729812678, + "learning_rate": 1.533695087769366e-05, + "loss": 0.5642, + "step": 4398 + }, + { + "epoch": 0.34, + "grad_norm": 1.2430540217432164, + "learning_rate": 1.5334825801150998e-05, + "loss": 0.6135, + "step": 4399 + }, + { + "epoch": 0.34, + "grad_norm": 1.2546036346644387, + "learning_rate": 1.5332700387793255e-05, + "loss": 0.6196, + "step": 4400 + }, + { + "epoch": 0.34, + "grad_norm": 1.2020119311427646, + "learning_rate": 1.5330574637754627e-05, + "loss": 0.5501, + "step": 4401 + }, + { + "epoch": 0.34, + "grad_norm": 1.234787932269557, + "learning_rate": 1.5328448551169318e-05, + "loss": 0.6314, + "step": 4402 + }, + { + "epoch": 0.34, + "grad_norm": 1.063317657452814, + "learning_rate": 1.532632212817156e-05, + "loss": 0.5592, + "step": 4403 + }, + { + "epoch": 0.34, + "grad_norm": 1.1959654078281263, + "learning_rate": 1.532419536889561e-05, + "loss": 0.5993, + "step": 4404 + }, + { + "epoch": 0.34, + "grad_norm": 1.2815247683027342, + "learning_rate": 1.5322068273475737e-05, + "loss": 0.5572, + "step": 4405 + }, + { + "epoch": 0.34, + "grad_norm": 1.2080824142031188, + "learning_rate": 1.531994084204623e-05, + "loss": 0.5759, + "step": 4406 + }, + { + "epoch": 0.34, + "grad_norm": 1.131741036755641, + "learning_rate": 1.5317813074741415e-05, + "loss": 0.5293, + "step": 4407 + }, + { + "epoch": 0.34, + "grad_norm": 1.2769330965737027, + "learning_rate": 1.531568497169562e-05, + "loss": 0.5712, + "step": 4408 + }, + { + "epoch": 0.34, + "grad_norm": 1.2140100200214425, + "learning_rate": 1.5313556533043212e-05, + "loss": 0.6009, + "step": 4409 + }, + { + "epoch": 0.34, + "grad_norm": 1.2692971351141003, + "learning_rate": 1.5311427758918564e-05, + "loss": 0.6094, + "step": 4410 + }, + { + "epoch": 0.34, + "grad_norm": 1.1314977448357104, + "learning_rate": 1.5309298649456075e-05, + "loss": 0.5783, + "step": 4411 + }, + { + "epoch": 0.34, + "grad_norm": 1.164858705768735, + "learning_rate": 1.5307169204790174e-05, + "loss": 0.6357, + "step": 4412 + }, + { + "epoch": 0.34, + "grad_norm": 1.1666847238959757, + "learning_rate": 1.5305039425055302e-05, + "loss": 0.5311, + "step": 4413 + }, + { + "epoch": 0.34, + "grad_norm": 1.1897781003297598, + "learning_rate": 1.5302909310385916e-05, + "loss": 0.5929, + "step": 4414 + }, + { + "epoch": 0.34, + "grad_norm": 1.1927447195485912, + "learning_rate": 1.530077886091651e-05, + "loss": 0.5851, + "step": 4415 + }, + { + "epoch": 0.34, + "grad_norm": 1.1762167292074122, + "learning_rate": 1.5298648076781583e-05, + "loss": 0.6102, + "step": 4416 + }, + { + "epoch": 0.34, + "grad_norm": 1.0890326541001754, + "learning_rate": 1.5296516958115666e-05, + "loss": 0.5465, + "step": 4417 + }, + { + "epoch": 0.34, + "grad_norm": 1.2519195123644502, + "learning_rate": 1.5294385505053305e-05, + "loss": 0.5832, + "step": 4418 + }, + { + "epoch": 0.34, + "grad_norm": 1.232027646714422, + "learning_rate": 1.5292253717729072e-05, + "loss": 0.6447, + "step": 4419 + }, + { + "epoch": 0.34, + "grad_norm": 1.2657436268160225, + "learning_rate": 1.529012159627756e-05, + "loss": 0.6693, + "step": 4420 + }, + { + "epoch": 0.34, + "grad_norm": 1.210744042478969, + "learning_rate": 1.5287989140833376e-05, + "loss": 0.5477, + "step": 4421 + }, + { + "epoch": 0.34, + "grad_norm": 1.0917837725622568, + "learning_rate": 1.5285856351531157e-05, + "loss": 0.58, + "step": 4422 + }, + { + "epoch": 0.34, + "grad_norm": 1.148882046769742, + "learning_rate": 1.5283723228505552e-05, + "loss": 0.5378, + "step": 4423 + }, + { + "epoch": 0.34, + "grad_norm": 1.083430518168269, + "learning_rate": 1.5281589771891244e-05, + "loss": 0.4977, + "step": 4424 + }, + { + "epoch": 0.34, + "grad_norm": 1.1574323125208323, + "learning_rate": 1.527945598182292e-05, + "loss": 0.5491, + "step": 4425 + }, + { + "epoch": 0.34, + "grad_norm": 1.1496577105923107, + "learning_rate": 1.5277321858435303e-05, + "loss": 0.5835, + "step": 4426 + }, + { + "epoch": 0.34, + "grad_norm": 1.2745523545806017, + "learning_rate": 1.527518740186313e-05, + "loss": 0.6344, + "step": 4427 + }, + { + "epoch": 0.34, + "grad_norm": 1.2802791638977342, + "learning_rate": 1.527305261224116e-05, + "loss": 0.615, + "step": 4428 + }, + { + "epoch": 0.34, + "grad_norm": 1.148809671107842, + "learning_rate": 1.5270917489704173e-05, + "loss": 0.5432, + "step": 4429 + }, + { + "epoch": 0.34, + "grad_norm": 1.1369891287553175, + "learning_rate": 1.5268782034386972e-05, + "loss": 0.6384, + "step": 4430 + }, + { + "epoch": 0.34, + "grad_norm": 1.0942323438559076, + "learning_rate": 1.5266646246424374e-05, + "loss": 0.5141, + "step": 4431 + }, + { + "epoch": 0.34, + "grad_norm": 1.1011347921080916, + "learning_rate": 1.5264510125951228e-05, + "loss": 0.563, + "step": 4432 + }, + { + "epoch": 0.34, + "grad_norm": 1.088406178683373, + "learning_rate": 1.5262373673102396e-05, + "loss": 0.5599, + "step": 4433 + }, + { + "epoch": 0.34, + "grad_norm": 1.302010536702029, + "learning_rate": 1.5260236888012766e-05, + "loss": 0.6163, + "step": 4434 + }, + { + "epoch": 0.34, + "grad_norm": 1.1073075492822713, + "learning_rate": 1.5258099770817242e-05, + "loss": 0.5581, + "step": 4435 + }, + { + "epoch": 0.34, + "grad_norm": 1.146683579771115, + "learning_rate": 1.525596232165075e-05, + "loss": 0.5596, + "step": 4436 + }, + { + "epoch": 0.34, + "grad_norm": 1.170456752876025, + "learning_rate": 1.5253824540648237e-05, + "loss": 0.5416, + "step": 4437 + }, + { + "epoch": 0.34, + "grad_norm": 1.1076545807229496, + "learning_rate": 1.5251686427944679e-05, + "loss": 0.5367, + "step": 4438 + }, + { + "epoch": 0.34, + "grad_norm": 1.1734350882554954, + "learning_rate": 1.524954798367506e-05, + "loss": 0.563, + "step": 4439 + }, + { + "epoch": 0.34, + "grad_norm": 1.1514843856011263, + "learning_rate": 1.5247409207974394e-05, + "loss": 0.588, + "step": 4440 + }, + { + "epoch": 0.34, + "grad_norm": 1.2255016525892417, + "learning_rate": 1.5245270100977707e-05, + "loss": 0.578, + "step": 4441 + }, + { + "epoch": 0.34, + "grad_norm": 1.185576336414464, + "learning_rate": 1.5243130662820058e-05, + "loss": 0.5966, + "step": 4442 + }, + { + "epoch": 0.34, + "grad_norm": 1.0944559816106305, + "learning_rate": 1.5240990893636522e-05, + "loss": 0.6105, + "step": 4443 + }, + { + "epoch": 0.34, + "grad_norm": 1.1557597074503254, + "learning_rate": 1.523885079356219e-05, + "loss": 0.6533, + "step": 4444 + }, + { + "epoch": 0.34, + "grad_norm": 1.1560541451026403, + "learning_rate": 1.5236710362732178e-05, + "loss": 0.5969, + "step": 4445 + }, + { + "epoch": 0.34, + "grad_norm": 1.0838196225445664, + "learning_rate": 1.5234569601281623e-05, + "loss": 0.5379, + "step": 4446 + }, + { + "epoch": 0.34, + "grad_norm": 1.2456933218234425, + "learning_rate": 1.523242850934568e-05, + "loss": 0.5772, + "step": 4447 + }, + { + "epoch": 0.35, + "grad_norm": 1.1409657113174614, + "learning_rate": 1.5230287087059532e-05, + "loss": 0.5509, + "step": 4448 + }, + { + "epoch": 0.35, + "grad_norm": 1.100200054010176, + "learning_rate": 1.5228145334558377e-05, + "loss": 0.5919, + "step": 4449 + }, + { + "epoch": 0.35, + "grad_norm": 1.2097921553927675, + "learning_rate": 1.5226003251977432e-05, + "loss": 0.6066, + "step": 4450 + }, + { + "epoch": 0.35, + "grad_norm": 1.2463753122060177, + "learning_rate": 1.5223860839451935e-05, + "loss": 0.5836, + "step": 4451 + }, + { + "epoch": 0.35, + "grad_norm": 1.081399065006784, + "learning_rate": 1.5221718097117157e-05, + "loss": 0.5625, + "step": 4452 + }, + { + "epoch": 0.35, + "grad_norm": 1.176278246906274, + "learning_rate": 1.5219575025108373e-05, + "loss": 0.5509, + "step": 4453 + }, + { + "epoch": 0.35, + "grad_norm": 1.153852089842175, + "learning_rate": 1.521743162356089e-05, + "loss": 0.5421, + "step": 4454 + }, + { + "epoch": 0.35, + "grad_norm": 1.238385457849631, + "learning_rate": 1.521528789261003e-05, + "loss": 0.6216, + "step": 4455 + }, + { + "epoch": 0.35, + "grad_norm": 1.128811472573839, + "learning_rate": 1.5213143832391133e-05, + "loss": 0.5805, + "step": 4456 + }, + { + "epoch": 0.35, + "grad_norm": 1.2818857337059608, + "learning_rate": 1.5210999443039573e-05, + "loss": 0.5872, + "step": 4457 + }, + { + "epoch": 0.35, + "grad_norm": 1.3346661323796543, + "learning_rate": 1.5208854724690734e-05, + "loss": 0.6064, + "step": 4458 + }, + { + "epoch": 0.35, + "grad_norm": 1.055467105528188, + "learning_rate": 1.5206709677480022e-05, + "loss": 0.5526, + "step": 4459 + }, + { + "epoch": 0.35, + "grad_norm": 1.1356789988736544, + "learning_rate": 1.5204564301542863e-05, + "loss": 0.5718, + "step": 4460 + }, + { + "epoch": 0.35, + "grad_norm": 1.159306070175738, + "learning_rate": 1.520241859701471e-05, + "loss": 0.6041, + "step": 4461 + }, + { + "epoch": 0.35, + "grad_norm": 1.1956427124274813, + "learning_rate": 1.5200272564031026e-05, + "loss": 0.5949, + "step": 4462 + }, + { + "epoch": 0.35, + "grad_norm": 1.2159479648725056, + "learning_rate": 1.5198126202727311e-05, + "loss": 0.6024, + "step": 4463 + }, + { + "epoch": 0.35, + "grad_norm": 1.1560298091417576, + "learning_rate": 1.5195979513239064e-05, + "loss": 0.5788, + "step": 4464 + }, + { + "epoch": 0.35, + "grad_norm": 1.1778144872440648, + "learning_rate": 1.5193832495701825e-05, + "loss": 0.5278, + "step": 4465 + }, + { + "epoch": 0.35, + "grad_norm": 1.168228732871617, + "learning_rate": 1.5191685150251146e-05, + "loss": 0.586, + "step": 4466 + }, + { + "epoch": 0.35, + "grad_norm": 1.2033463806984266, + "learning_rate": 1.5189537477022595e-05, + "loss": 0.6291, + "step": 4467 + }, + { + "epoch": 0.35, + "grad_norm": 1.0862332497063598, + "learning_rate": 1.518738947615177e-05, + "loss": 0.5285, + "step": 4468 + }, + { + "epoch": 0.35, + "grad_norm": 1.1921114976087672, + "learning_rate": 1.5185241147774283e-05, + "loss": 0.6343, + "step": 4469 + }, + { + "epoch": 0.35, + "grad_norm": 1.0100118016035045, + "learning_rate": 1.5183092492025772e-05, + "loss": 0.5398, + "step": 4470 + }, + { + "epoch": 0.35, + "grad_norm": 1.2192429254861237, + "learning_rate": 1.518094350904189e-05, + "loss": 0.5893, + "step": 4471 + }, + { + "epoch": 0.35, + "grad_norm": 1.0759556270937056, + "learning_rate": 1.5178794198958313e-05, + "loss": 0.5805, + "step": 4472 + }, + { + "epoch": 0.35, + "grad_norm": 1.2030424114466285, + "learning_rate": 1.517664456191074e-05, + "loss": 0.6144, + "step": 4473 + }, + { + "epoch": 0.35, + "grad_norm": 1.2887942873192972, + "learning_rate": 1.5174494598034889e-05, + "loss": 0.6429, + "step": 4474 + }, + { + "epoch": 0.35, + "grad_norm": 1.2304358583928532, + "learning_rate": 1.5172344307466493e-05, + "loss": 0.6387, + "step": 4475 + }, + { + "epoch": 0.35, + "grad_norm": 1.210997106254138, + "learning_rate": 1.517019369034132e-05, + "loss": 0.6773, + "step": 4476 + }, + { + "epoch": 0.35, + "grad_norm": 1.1317197066917521, + "learning_rate": 1.516804274679514e-05, + "loss": 0.573, + "step": 4477 + }, + { + "epoch": 0.35, + "grad_norm": 1.226489799920522, + "learning_rate": 1.5165891476963763e-05, + "loss": 0.5784, + "step": 4478 + }, + { + "epoch": 0.35, + "grad_norm": 1.098668673459862, + "learning_rate": 1.5163739880983002e-05, + "loss": 0.5126, + "step": 4479 + }, + { + "epoch": 0.35, + "grad_norm": 1.2200222222070949, + "learning_rate": 1.5161587958988699e-05, + "loss": 0.6562, + "step": 4480 + }, + { + "epoch": 0.35, + "grad_norm": 1.1494085656928357, + "learning_rate": 1.515943571111672e-05, + "loss": 0.6214, + "step": 4481 + }, + { + "epoch": 0.35, + "grad_norm": 1.0634176554973584, + "learning_rate": 1.5157283137502944e-05, + "loss": 0.5118, + "step": 4482 + }, + { + "epoch": 0.35, + "grad_norm": 1.1172460260600112, + "learning_rate": 1.5155130238283277e-05, + "loss": 0.5477, + "step": 4483 + }, + { + "epoch": 0.35, + "grad_norm": 1.119030479350138, + "learning_rate": 1.5152977013593643e-05, + "loss": 0.5482, + "step": 4484 + }, + { + "epoch": 0.35, + "grad_norm": 1.2970108972149006, + "learning_rate": 1.5150823463569979e-05, + "loss": 0.5596, + "step": 4485 + }, + { + "epoch": 0.35, + "grad_norm": 1.1490856599276307, + "learning_rate": 1.514866958834826e-05, + "loss": 0.5687, + "step": 4486 + }, + { + "epoch": 0.35, + "grad_norm": 1.1965998280984946, + "learning_rate": 1.5146515388064463e-05, + "loss": 0.5549, + "step": 4487 + }, + { + "epoch": 0.35, + "grad_norm": 1.1337593593862998, + "learning_rate": 1.5144360862854597e-05, + "loss": 0.564, + "step": 4488 + }, + { + "epoch": 0.35, + "grad_norm": 1.1684703961639018, + "learning_rate": 1.5142206012854693e-05, + "loss": 0.6251, + "step": 4489 + }, + { + "epoch": 0.35, + "grad_norm": 1.1579033140327963, + "learning_rate": 1.5140050838200786e-05, + "loss": 0.6207, + "step": 4490 + }, + { + "epoch": 0.35, + "grad_norm": 1.3167438470375483, + "learning_rate": 1.5137895339028955e-05, + "loss": 0.599, + "step": 4491 + }, + { + "epoch": 0.35, + "grad_norm": 1.2358226737250553, + "learning_rate": 1.5135739515475281e-05, + "loss": 0.523, + "step": 4492 + }, + { + "epoch": 0.35, + "grad_norm": 1.2380342447755142, + "learning_rate": 1.5133583367675878e-05, + "loss": 0.5884, + "step": 4493 + }, + { + "epoch": 0.35, + "grad_norm": 1.2101236377161075, + "learning_rate": 1.5131426895766868e-05, + "loss": 0.5678, + "step": 4494 + }, + { + "epoch": 0.35, + "grad_norm": 1.2288445303830322, + "learning_rate": 1.5129270099884403e-05, + "loss": 0.5841, + "step": 4495 + }, + { + "epoch": 0.35, + "grad_norm": 1.54702295693919, + "learning_rate": 1.5127112980164655e-05, + "loss": 0.6066, + "step": 4496 + }, + { + "epoch": 0.35, + "grad_norm": 1.113243236227367, + "learning_rate": 1.512495553674381e-05, + "loss": 0.591, + "step": 4497 + }, + { + "epoch": 0.35, + "grad_norm": 1.1566739207341692, + "learning_rate": 1.5122797769758081e-05, + "loss": 0.5457, + "step": 4498 + }, + { + "epoch": 0.35, + "grad_norm": 1.0650606154264042, + "learning_rate": 1.5120639679343702e-05, + "loss": 0.5679, + "step": 4499 + }, + { + "epoch": 0.35, + "grad_norm": 1.1669668027279647, + "learning_rate": 1.5118481265636917e-05, + "loss": 0.5665, + "step": 4500 + }, + { + "epoch": 0.35, + "grad_norm": 1.2619587106131802, + "learning_rate": 1.5116322528774005e-05, + "loss": 0.5949, + "step": 4501 + }, + { + "epoch": 0.35, + "grad_norm": 1.2086465144706862, + "learning_rate": 1.5114163468891252e-05, + "loss": 0.5736, + "step": 4502 + }, + { + "epoch": 0.35, + "grad_norm": 1.182223795596568, + "learning_rate": 1.5112004086124976e-05, + "loss": 0.5736, + "step": 4503 + }, + { + "epoch": 0.35, + "grad_norm": 1.1786402418021102, + "learning_rate": 1.5109844380611506e-05, + "loss": 0.5935, + "step": 4504 + }, + { + "epoch": 0.35, + "grad_norm": 1.2747217267363526, + "learning_rate": 1.51076843524872e-05, + "loss": 0.5938, + "step": 4505 + }, + { + "epoch": 0.35, + "grad_norm": 1.2519558863378772, + "learning_rate": 1.5105524001888425e-05, + "loss": 0.6118, + "step": 4506 + }, + { + "epoch": 0.35, + "grad_norm": 1.2270405317698923, + "learning_rate": 1.510336332895158e-05, + "loss": 0.6242, + "step": 4507 + }, + { + "epoch": 0.35, + "grad_norm": 1.1195539473517495, + "learning_rate": 1.5101202333813078e-05, + "loss": 0.5322, + "step": 4508 + }, + { + "epoch": 0.35, + "grad_norm": 1.1557193775745187, + "learning_rate": 1.5099041016609355e-05, + "loss": 0.5888, + "step": 4509 + }, + { + "epoch": 0.35, + "grad_norm": 1.1709091784165144, + "learning_rate": 1.5096879377476864e-05, + "loss": 0.5435, + "step": 4510 + }, + { + "epoch": 0.35, + "grad_norm": 1.1386916615253955, + "learning_rate": 1.509471741655208e-05, + "loss": 0.5176, + "step": 4511 + }, + { + "epoch": 0.35, + "grad_norm": 1.200871326093473, + "learning_rate": 1.5092555133971502e-05, + "loss": 0.6216, + "step": 4512 + }, + { + "epoch": 0.35, + "grad_norm": 1.2124918652291095, + "learning_rate": 1.5090392529871645e-05, + "loss": 0.5986, + "step": 4513 + }, + { + "epoch": 0.35, + "grad_norm": 1.1511681200994655, + "learning_rate": 1.5088229604389045e-05, + "loss": 0.5848, + "step": 4514 + }, + { + "epoch": 0.35, + "grad_norm": 1.2295573901278383, + "learning_rate": 1.5086066357660255e-05, + "loss": 0.5798, + "step": 4515 + }, + { + "epoch": 0.35, + "grad_norm": 1.0988644504068925, + "learning_rate": 1.5083902789821854e-05, + "loss": 0.6043, + "step": 4516 + }, + { + "epoch": 0.35, + "grad_norm": 1.225616819440978, + "learning_rate": 1.5081738901010446e-05, + "loss": 0.5765, + "step": 4517 + }, + { + "epoch": 0.35, + "grad_norm": 1.1822751698064238, + "learning_rate": 1.507957469136264e-05, + "loss": 0.6063, + "step": 4518 + }, + { + "epoch": 0.35, + "grad_norm": 1.113194244732716, + "learning_rate": 1.5077410161015078e-05, + "loss": 0.5454, + "step": 4519 + }, + { + "epoch": 0.35, + "grad_norm": 1.2642688785506542, + "learning_rate": 1.5075245310104414e-05, + "loss": 0.6169, + "step": 4520 + }, + { + "epoch": 0.35, + "grad_norm": 1.1296981621076856, + "learning_rate": 1.507308013876733e-05, + "loss": 0.5632, + "step": 4521 + }, + { + "epoch": 0.35, + "grad_norm": 1.2368674405152718, + "learning_rate": 1.5070914647140522e-05, + "loss": 0.6307, + "step": 4522 + }, + { + "epoch": 0.35, + "grad_norm": 1.2112597129125278, + "learning_rate": 1.5068748835360713e-05, + "loss": 0.5861, + "step": 4523 + }, + { + "epoch": 0.35, + "grad_norm": 1.219331749208178, + "learning_rate": 1.5066582703564638e-05, + "loss": 0.6005, + "step": 4524 + }, + { + "epoch": 0.35, + "grad_norm": 1.2052131960459438, + "learning_rate": 1.5064416251889053e-05, + "loss": 0.6435, + "step": 4525 + }, + { + "epoch": 0.35, + "grad_norm": 1.123783566543255, + "learning_rate": 1.5062249480470742e-05, + "loss": 0.5214, + "step": 4526 + }, + { + "epoch": 0.35, + "grad_norm": 1.1489162355505493, + "learning_rate": 1.5060082389446509e-05, + "loss": 0.5995, + "step": 4527 + }, + { + "epoch": 0.35, + "grad_norm": 1.1174787128477695, + "learning_rate": 1.5057914978953166e-05, + "loss": 0.5736, + "step": 4528 + }, + { + "epoch": 0.35, + "grad_norm": 1.1073052884852255, + "learning_rate": 1.5055747249127552e-05, + "loss": 0.611, + "step": 4529 + }, + { + "epoch": 0.35, + "grad_norm": 1.0962024985518966, + "learning_rate": 1.5053579200106531e-05, + "loss": 0.5769, + "step": 4530 + }, + { + "epoch": 0.35, + "grad_norm": 1.175217756425074, + "learning_rate": 1.505141083202698e-05, + "loss": 0.5486, + "step": 4531 + }, + { + "epoch": 0.35, + "grad_norm": 1.1204931797124718, + "learning_rate": 1.5049242145025806e-05, + "loss": 0.6081, + "step": 4532 + }, + { + "epoch": 0.35, + "grad_norm": 1.1347182981303137, + "learning_rate": 1.5047073139239922e-05, + "loss": 0.6092, + "step": 4533 + }, + { + "epoch": 0.35, + "grad_norm": 1.2278380352344656, + "learning_rate": 1.5044903814806273e-05, + "loss": 0.5842, + "step": 4534 + }, + { + "epoch": 0.35, + "grad_norm": 1.2165225777590694, + "learning_rate": 1.5042734171861815e-05, + "loss": 0.5976, + "step": 4535 + }, + { + "epoch": 0.35, + "grad_norm": 1.194584843173679, + "learning_rate": 1.5040564210543532e-05, + "loss": 0.5858, + "step": 4536 + }, + { + "epoch": 0.35, + "grad_norm": 1.0587298879323264, + "learning_rate": 1.5038393930988426e-05, + "loss": 0.5562, + "step": 4537 + }, + { + "epoch": 0.35, + "grad_norm": 1.2387121756405723, + "learning_rate": 1.5036223333333517e-05, + "loss": 0.6154, + "step": 4538 + }, + { + "epoch": 0.35, + "grad_norm": 1.1080072056987036, + "learning_rate": 1.5034052417715846e-05, + "loss": 0.6061, + "step": 4539 + }, + { + "epoch": 0.35, + "grad_norm": 1.07299707475603, + "learning_rate": 1.503188118427247e-05, + "loss": 0.4944, + "step": 4540 + }, + { + "epoch": 0.35, + "grad_norm": 1.1760255173328118, + "learning_rate": 1.5029709633140476e-05, + "loss": 0.5106, + "step": 4541 + }, + { + "epoch": 0.35, + "grad_norm": 1.1956653448178554, + "learning_rate": 1.5027537764456963e-05, + "loss": 0.5875, + "step": 4542 + }, + { + "epoch": 0.35, + "grad_norm": 1.1648337862035085, + "learning_rate": 1.5025365578359053e-05, + "loss": 0.5466, + "step": 4543 + }, + { + "epoch": 0.35, + "grad_norm": 1.237878679848045, + "learning_rate": 1.5023193074983886e-05, + "loss": 0.6518, + "step": 4544 + }, + { + "epoch": 0.35, + "grad_norm": 1.07618068130684, + "learning_rate": 1.5021020254468623e-05, + "loss": 0.5112, + "step": 4545 + }, + { + "epoch": 0.35, + "grad_norm": 1.1895439727114845, + "learning_rate": 1.5018847116950445e-05, + "loss": 0.6323, + "step": 4546 + }, + { + "epoch": 0.35, + "grad_norm": 1.193688529863557, + "learning_rate": 1.5016673662566558e-05, + "loss": 0.5816, + "step": 4547 + }, + { + "epoch": 0.35, + "grad_norm": 1.2262444539387036, + "learning_rate": 1.501449989145418e-05, + "loss": 0.6077, + "step": 4548 + }, + { + "epoch": 0.35, + "grad_norm": 1.1452765528097446, + "learning_rate": 1.501232580375055e-05, + "loss": 0.6007, + "step": 4549 + }, + { + "epoch": 0.35, + "grad_norm": 1.173722857485436, + "learning_rate": 1.5010151399592934e-05, + "loss": 0.5908, + "step": 4550 + }, + { + "epoch": 0.35, + "grad_norm": 1.285188204939091, + "learning_rate": 1.500797667911861e-05, + "loss": 0.6214, + "step": 4551 + }, + { + "epoch": 0.35, + "grad_norm": 1.233078384253019, + "learning_rate": 1.5005801642464879e-05, + "loss": 0.57, + "step": 4552 + }, + { + "epoch": 0.35, + "grad_norm": 1.1913349568032714, + "learning_rate": 1.5003626289769066e-05, + "loss": 0.6218, + "step": 4553 + }, + { + "epoch": 0.35, + "grad_norm": 1.1941912010496085, + "learning_rate": 1.5001450621168507e-05, + "loss": 0.5583, + "step": 4554 + }, + { + "epoch": 0.35, + "grad_norm": 1.332888106675841, + "learning_rate": 1.4999274636800572e-05, + "loss": 0.6261, + "step": 4555 + }, + { + "epoch": 0.35, + "grad_norm": 1.1712951751595282, + "learning_rate": 1.4997098336802631e-05, + "loss": 0.5899, + "step": 4556 + }, + { + "epoch": 0.35, + "grad_norm": 1.0782279919068292, + "learning_rate": 1.4994921721312092e-05, + "loss": 0.5431, + "step": 4557 + }, + { + "epoch": 0.35, + "grad_norm": 1.128441790961086, + "learning_rate": 1.4992744790466376e-05, + "loss": 0.5064, + "step": 4558 + }, + { + "epoch": 0.35, + "grad_norm": 1.3008492025319331, + "learning_rate": 1.4990567544402918e-05, + "loss": 0.6312, + "step": 4559 + }, + { + "epoch": 0.35, + "grad_norm": 1.18534670904363, + "learning_rate": 1.4988389983259188e-05, + "loss": 0.537, + "step": 4560 + }, + { + "epoch": 0.35, + "grad_norm": 1.1484538485862017, + "learning_rate": 1.4986212107172658e-05, + "loss": 0.5871, + "step": 4561 + }, + { + "epoch": 0.35, + "grad_norm": 1.1217149349349158, + "learning_rate": 1.4984033916280833e-05, + "loss": 0.5938, + "step": 4562 + }, + { + "epoch": 0.35, + "grad_norm": 1.1819202943350071, + "learning_rate": 1.4981855410721236e-05, + "loss": 0.5871, + "step": 4563 + }, + { + "epoch": 0.35, + "grad_norm": 1.1638147999457185, + "learning_rate": 1.4979676590631398e-05, + "loss": 0.6018, + "step": 4564 + }, + { + "epoch": 0.35, + "grad_norm": 1.1778014813942903, + "learning_rate": 1.4977497456148891e-05, + "loss": 0.5923, + "step": 4565 + }, + { + "epoch": 0.35, + "grad_norm": 1.099815986153892, + "learning_rate": 1.4975318007411284e-05, + "loss": 0.507, + "step": 4566 + }, + { + "epoch": 0.35, + "grad_norm": 1.0380142775713128, + "learning_rate": 1.4973138244556184e-05, + "loss": 0.482, + "step": 4567 + }, + { + "epoch": 0.35, + "grad_norm": 1.217544203705804, + "learning_rate": 1.497095816772121e-05, + "loss": 0.6523, + "step": 4568 + }, + { + "epoch": 0.35, + "grad_norm": 1.1779864849552317, + "learning_rate": 1.4968777777043997e-05, + "loss": 0.5472, + "step": 4569 + }, + { + "epoch": 0.35, + "grad_norm": 1.189498424450415, + "learning_rate": 1.496659707266221e-05, + "loss": 0.538, + "step": 4570 + }, + { + "epoch": 0.35, + "grad_norm": 1.19365597299684, + "learning_rate": 1.4964416054713525e-05, + "loss": 0.5788, + "step": 4571 + }, + { + "epoch": 0.35, + "grad_norm": 1.2080286836787093, + "learning_rate": 1.4962234723335642e-05, + "loss": 0.5859, + "step": 4572 + }, + { + "epoch": 0.35, + "grad_norm": 1.1123784738236282, + "learning_rate": 1.4960053078666278e-05, + "loss": 0.5111, + "step": 4573 + }, + { + "epoch": 0.35, + "grad_norm": 1.134893886126242, + "learning_rate": 1.4957871120843172e-05, + "loss": 0.5944, + "step": 4574 + }, + { + "epoch": 0.35, + "grad_norm": 1.1983506729523272, + "learning_rate": 1.4955688850004087e-05, + "loss": 0.5461, + "step": 4575 + }, + { + "epoch": 0.35, + "grad_norm": 1.2331431072469832, + "learning_rate": 1.495350626628679e-05, + "loss": 0.6081, + "step": 4576 + }, + { + "epoch": 0.36, + "grad_norm": 1.217480756619316, + "learning_rate": 1.4951323369829091e-05, + "loss": 0.5024, + "step": 4577 + }, + { + "epoch": 0.36, + "grad_norm": 1.130900275194096, + "learning_rate": 1.4949140160768803e-05, + "loss": 0.5241, + "step": 4578 + }, + { + "epoch": 0.36, + "grad_norm": 1.0714250893763317, + "learning_rate": 1.4946956639243757e-05, + "loss": 0.5332, + "step": 4579 + }, + { + "epoch": 0.36, + "grad_norm": 1.1859485128140161, + "learning_rate": 1.4944772805391821e-05, + "loss": 0.5833, + "step": 4580 + }, + { + "epoch": 0.36, + "grad_norm": 1.1704996812405288, + "learning_rate": 1.4942588659350863e-05, + "loss": 0.5798, + "step": 4581 + }, + { + "epoch": 0.36, + "grad_norm": 1.1671876879261687, + "learning_rate": 1.4940404201258782e-05, + "loss": 0.5772, + "step": 4582 + }, + { + "epoch": 0.36, + "grad_norm": 1.2348541102553379, + "learning_rate": 1.4938219431253499e-05, + "loss": 0.6385, + "step": 4583 + }, + { + "epoch": 0.36, + "grad_norm": 1.1598463061886657, + "learning_rate": 1.4936034349472941e-05, + "loss": 0.5616, + "step": 4584 + }, + { + "epoch": 0.36, + "grad_norm": 1.1316569781743098, + "learning_rate": 1.4933848956055068e-05, + "loss": 0.5603, + "step": 4585 + }, + { + "epoch": 0.36, + "grad_norm": 1.1578602789983201, + "learning_rate": 1.4931663251137856e-05, + "loss": 0.5509, + "step": 4586 + }, + { + "epoch": 0.36, + "grad_norm": 1.0901857885334052, + "learning_rate": 1.4929477234859299e-05, + "loss": 0.5413, + "step": 4587 + }, + { + "epoch": 0.36, + "grad_norm": 1.1502188536504823, + "learning_rate": 1.4927290907357415e-05, + "loss": 0.5587, + "step": 4588 + }, + { + "epoch": 0.36, + "grad_norm": 1.2573429439594714, + "learning_rate": 1.4925104268770227e-05, + "loss": 0.6477, + "step": 4589 + }, + { + "epoch": 0.36, + "grad_norm": 1.0957070416875185, + "learning_rate": 1.4922917319235804e-05, + "loss": 0.5368, + "step": 4590 + }, + { + "epoch": 0.36, + "grad_norm": 1.1902997491283291, + "learning_rate": 1.4920730058892205e-05, + "loss": 0.585, + "step": 4591 + }, + { + "epoch": 0.36, + "grad_norm": 1.2319059183623426, + "learning_rate": 1.4918542487877535e-05, + "loss": 0.592, + "step": 4592 + }, + { + "epoch": 0.36, + "grad_norm": 1.1735340327596926, + "learning_rate": 1.49163546063299e-05, + "loss": 0.5087, + "step": 4593 + }, + { + "epoch": 0.36, + "grad_norm": 1.2132498045000901, + "learning_rate": 1.4914166414387433e-05, + "loss": 0.5851, + "step": 4594 + }, + { + "epoch": 0.36, + "grad_norm": 1.2265314960813345, + "learning_rate": 1.4911977912188284e-05, + "loss": 0.5568, + "step": 4595 + }, + { + "epoch": 0.36, + "grad_norm": 1.0343460078944768, + "learning_rate": 1.490978909987063e-05, + "loss": 0.5331, + "step": 4596 + }, + { + "epoch": 0.36, + "grad_norm": 1.2338777820307305, + "learning_rate": 1.4907599977572659e-05, + "loss": 0.586, + "step": 4597 + }, + { + "epoch": 0.36, + "grad_norm": 1.1826844200219562, + "learning_rate": 1.490541054543258e-05, + "loss": 0.5681, + "step": 4598 + }, + { + "epoch": 0.36, + "grad_norm": 1.2097961954030196, + "learning_rate": 1.4903220803588627e-05, + "loss": 0.5671, + "step": 4599 + }, + { + "epoch": 0.36, + "grad_norm": 1.2305487228320586, + "learning_rate": 1.4901030752179044e-05, + "loss": 0.6165, + "step": 4600 + }, + { + "epoch": 0.36, + "grad_norm": 1.2404697945702465, + "learning_rate": 1.4898840391342107e-05, + "loss": 0.5561, + "step": 4601 + }, + { + "epoch": 0.36, + "grad_norm": 1.1278644757596286, + "learning_rate": 1.4896649721216101e-05, + "loss": 0.5668, + "step": 4602 + }, + { + "epoch": 0.36, + "grad_norm": 1.0960607370093876, + "learning_rate": 1.4894458741939333e-05, + "loss": 0.585, + "step": 4603 + }, + { + "epoch": 0.36, + "grad_norm": 1.1829674199794296, + "learning_rate": 1.4892267453650133e-05, + "loss": 0.5935, + "step": 4604 + }, + { + "epoch": 0.36, + "grad_norm": 1.0172343487945301, + "learning_rate": 1.4890075856486848e-05, + "loss": 0.4999, + "step": 4605 + }, + { + "epoch": 0.36, + "grad_norm": 1.2785751986238711, + "learning_rate": 1.4887883950587845e-05, + "loss": 0.5755, + "step": 4606 + }, + { + "epoch": 0.36, + "grad_norm": 1.2564671115297674, + "learning_rate": 1.488569173609151e-05, + "loss": 0.5793, + "step": 4607 + }, + { + "epoch": 0.36, + "grad_norm": 1.2780503118279807, + "learning_rate": 1.488349921313625e-05, + "loss": 0.5999, + "step": 4608 + }, + { + "epoch": 0.36, + "grad_norm": 1.1265165914845163, + "learning_rate": 1.4881306381860485e-05, + "loss": 0.5823, + "step": 4609 + }, + { + "epoch": 0.36, + "grad_norm": 1.1812260176227336, + "learning_rate": 1.4879113242402668e-05, + "loss": 0.5505, + "step": 4610 + }, + { + "epoch": 0.36, + "grad_norm": 1.1186908663673416, + "learning_rate": 1.4876919794901256e-05, + "loss": 0.5666, + "step": 4611 + }, + { + "epoch": 0.36, + "grad_norm": 1.1935460620456722, + "learning_rate": 1.487472603949474e-05, + "loss": 0.5874, + "step": 4612 + }, + { + "epoch": 0.36, + "grad_norm": 1.2067492998309621, + "learning_rate": 1.4872531976321619e-05, + "loss": 0.655, + "step": 4613 + }, + { + "epoch": 0.36, + "grad_norm": 1.104979174145839, + "learning_rate": 1.4870337605520408e-05, + "loss": 0.5547, + "step": 4614 + }, + { + "epoch": 0.36, + "grad_norm": 1.0634184962482276, + "learning_rate": 1.4868142927229662e-05, + "loss": 0.5519, + "step": 4615 + }, + { + "epoch": 0.36, + "grad_norm": 1.1808072264447613, + "learning_rate": 1.4865947941587938e-05, + "loss": 0.6282, + "step": 4616 + }, + { + "epoch": 0.36, + "grad_norm": 1.2570182235355767, + "learning_rate": 1.4863752648733812e-05, + "loss": 0.5545, + "step": 4617 + }, + { + "epoch": 0.36, + "grad_norm": 1.2856973361230202, + "learning_rate": 1.486155704880589e-05, + "loss": 0.675, + "step": 4618 + }, + { + "epoch": 0.36, + "grad_norm": 1.3064167646339222, + "learning_rate": 1.4859361141942788e-05, + "loss": 0.6257, + "step": 4619 + }, + { + "epoch": 0.36, + "grad_norm": 1.1636661136275848, + "learning_rate": 1.4857164928283143e-05, + "loss": 0.5525, + "step": 4620 + }, + { + "epoch": 0.36, + "grad_norm": 1.2044451023162963, + "learning_rate": 1.4854968407965621e-05, + "loss": 0.589, + "step": 4621 + }, + { + "epoch": 0.36, + "grad_norm": 1.4738054319768479, + "learning_rate": 1.4852771581128895e-05, + "loss": 0.6303, + "step": 4622 + }, + { + "epoch": 0.36, + "grad_norm": 1.1010681558175648, + "learning_rate": 1.4850574447911661e-05, + "loss": 0.5883, + "step": 4623 + }, + { + "epoch": 0.36, + "grad_norm": 1.1235184450659916, + "learning_rate": 1.4848377008452635e-05, + "loss": 0.5364, + "step": 4624 + }, + { + "epoch": 0.36, + "grad_norm": 1.2718744338875525, + "learning_rate": 1.4846179262890554e-05, + "loss": 0.6048, + "step": 4625 + }, + { + "epoch": 0.36, + "grad_norm": 1.236600294482748, + "learning_rate": 1.4843981211364175e-05, + "loss": 0.6051, + "step": 4626 + }, + { + "epoch": 0.36, + "grad_norm": 1.1642824515615444, + "learning_rate": 1.484178285401227e-05, + "loss": 0.5463, + "step": 4627 + }, + { + "epoch": 0.36, + "grad_norm": 1.2228783353986021, + "learning_rate": 1.4839584190973633e-05, + "loss": 0.5762, + "step": 4628 + }, + { + "epoch": 0.36, + "grad_norm": 1.1734859837498324, + "learning_rate": 1.4837385222387078e-05, + "loss": 0.5542, + "step": 4629 + }, + { + "epoch": 0.36, + "grad_norm": 1.0824503723820524, + "learning_rate": 1.4835185948391433e-05, + "loss": 0.6399, + "step": 4630 + }, + { + "epoch": 0.36, + "grad_norm": 1.1092326650513769, + "learning_rate": 1.4832986369125558e-05, + "loss": 0.5604, + "step": 4631 + }, + { + "epoch": 0.36, + "grad_norm": 1.3168809527247678, + "learning_rate": 1.4830786484728315e-05, + "loss": 0.6606, + "step": 4632 + }, + { + "epoch": 0.36, + "grad_norm": 1.1675323385460907, + "learning_rate": 1.4828586295338597e-05, + "loss": 0.5924, + "step": 4633 + }, + { + "epoch": 0.36, + "grad_norm": 1.1717783061507685, + "learning_rate": 1.4826385801095315e-05, + "loss": 0.5686, + "step": 4634 + }, + { + "epoch": 0.36, + "grad_norm": 1.197491174956039, + "learning_rate": 1.4824185002137396e-05, + "loss": 0.5732, + "step": 4635 + }, + { + "epoch": 0.36, + "grad_norm": 1.130614997140509, + "learning_rate": 1.4821983898603791e-05, + "loss": 0.5606, + "step": 4636 + }, + { + "epoch": 0.36, + "grad_norm": 1.1701476527237085, + "learning_rate": 1.4819782490633463e-05, + "loss": 0.5743, + "step": 4637 + }, + { + "epoch": 0.36, + "grad_norm": 1.1435780380301737, + "learning_rate": 1.4817580778365396e-05, + "loss": 0.5634, + "step": 4638 + }, + { + "epoch": 0.36, + "grad_norm": 1.2038043692601252, + "learning_rate": 1.4815378761938603e-05, + "loss": 0.6005, + "step": 4639 + }, + { + "epoch": 0.36, + "grad_norm": 1.1275125632252232, + "learning_rate": 1.4813176441492104e-05, + "loss": 0.5623, + "step": 4640 + }, + { + "epoch": 0.36, + "grad_norm": 1.1594266813126217, + "learning_rate": 1.4810973817164941e-05, + "loss": 0.5531, + "step": 4641 + }, + { + "epoch": 0.36, + "grad_norm": 1.2656768505636797, + "learning_rate": 1.4808770889096184e-05, + "loss": 0.6042, + "step": 4642 + }, + { + "epoch": 0.36, + "grad_norm": 1.1705646564105916, + "learning_rate": 1.4806567657424908e-05, + "loss": 0.5571, + "step": 4643 + }, + { + "epoch": 0.36, + "grad_norm": 1.1688436338075159, + "learning_rate": 1.4804364122290217e-05, + "loss": 0.5717, + "step": 4644 + }, + { + "epoch": 0.36, + "grad_norm": 1.200153346992501, + "learning_rate": 1.4802160283831233e-05, + "loss": 0.5803, + "step": 4645 + }, + { + "epoch": 0.36, + "grad_norm": 1.2668605942511664, + "learning_rate": 1.4799956142187094e-05, + "loss": 0.5679, + "step": 4646 + }, + { + "epoch": 0.36, + "grad_norm": 1.3013203279983399, + "learning_rate": 1.479775169749696e-05, + "loss": 0.6632, + "step": 4647 + }, + { + "epoch": 0.36, + "grad_norm": 1.156828297517851, + "learning_rate": 1.4795546949900006e-05, + "loss": 0.5417, + "step": 4648 + }, + { + "epoch": 0.36, + "grad_norm": 1.1721538466257775, + "learning_rate": 1.4793341899535434e-05, + "loss": 0.5572, + "step": 4649 + }, + { + "epoch": 0.36, + "grad_norm": 1.2127220540378292, + "learning_rate": 1.4791136546542454e-05, + "loss": 0.5855, + "step": 4650 + }, + { + "epoch": 0.36, + "grad_norm": 1.2433614881223698, + "learning_rate": 1.4788930891060307e-05, + "loss": 0.6232, + "step": 4651 + }, + { + "epoch": 0.36, + "grad_norm": 1.1928735921164484, + "learning_rate": 1.4786724933228247e-05, + "loss": 0.6087, + "step": 4652 + }, + { + "epoch": 0.36, + "grad_norm": 1.23211027580133, + "learning_rate": 1.4784518673185542e-05, + "loss": 0.6004, + "step": 4653 + }, + { + "epoch": 0.36, + "grad_norm": 1.157972907976523, + "learning_rate": 1.478231211107149e-05, + "loss": 0.5712, + "step": 4654 + }, + { + "epoch": 0.36, + "grad_norm": 1.1700557574818624, + "learning_rate": 1.47801052470254e-05, + "loss": 0.6394, + "step": 4655 + }, + { + "epoch": 0.36, + "grad_norm": 1.2900370295333148, + "learning_rate": 1.4777898081186606e-05, + "loss": 0.6146, + "step": 4656 + }, + { + "epoch": 0.36, + "grad_norm": 1.2731685705505695, + "learning_rate": 1.4775690613694453e-05, + "loss": 0.6077, + "step": 4657 + }, + { + "epoch": 0.36, + "grad_norm": 1.286985631618011, + "learning_rate": 1.4773482844688313e-05, + "loss": 0.6205, + "step": 4658 + }, + { + "epoch": 0.36, + "grad_norm": 1.0994185016086377, + "learning_rate": 1.4771274774307573e-05, + "loss": 0.5765, + "step": 4659 + }, + { + "epoch": 0.36, + "grad_norm": 1.2076429280106338, + "learning_rate": 1.4769066402691641e-05, + "loss": 0.6024, + "step": 4660 + }, + { + "epoch": 0.36, + "grad_norm": 1.2295042102519795, + "learning_rate": 1.476685772997994e-05, + "loss": 0.6074, + "step": 4661 + }, + { + "epoch": 0.36, + "grad_norm": 1.1001036161692197, + "learning_rate": 1.476464875631192e-05, + "loss": 0.5811, + "step": 4662 + }, + { + "epoch": 0.36, + "grad_norm": 1.3983361830850933, + "learning_rate": 1.4762439481827038e-05, + "loss": 0.6713, + "step": 4663 + }, + { + "epoch": 0.36, + "grad_norm": 1.1104922511634667, + "learning_rate": 1.4760229906664782e-05, + "loss": 0.5738, + "step": 4664 + }, + { + "epoch": 0.36, + "grad_norm": 1.1672524389460859, + "learning_rate": 1.4758020030964653e-05, + "loss": 0.6111, + "step": 4665 + }, + { + "epoch": 0.36, + "grad_norm": 1.2111903265080481, + "learning_rate": 1.4755809854866172e-05, + "loss": 0.6154, + "step": 4666 + }, + { + "epoch": 0.36, + "grad_norm": 1.0708957948292044, + "learning_rate": 1.4753599378508876e-05, + "loss": 0.5897, + "step": 4667 + }, + { + "epoch": 0.36, + "grad_norm": 1.1909672162711045, + "learning_rate": 1.4751388602032326e-05, + "loss": 0.585, + "step": 4668 + }, + { + "epoch": 0.36, + "grad_norm": 1.3186552492787873, + "learning_rate": 1.4749177525576102e-05, + "loss": 0.5289, + "step": 4669 + }, + { + "epoch": 0.36, + "grad_norm": 1.2254133247313488, + "learning_rate": 1.4746966149279796e-05, + "loss": 0.5357, + "step": 4670 + }, + { + "epoch": 0.36, + "grad_norm": 1.212856765249875, + "learning_rate": 1.4744754473283024e-05, + "loss": 0.6066, + "step": 4671 + }, + { + "epoch": 0.36, + "grad_norm": 1.2209873204334436, + "learning_rate": 1.4742542497725428e-05, + "loss": 0.5713, + "step": 4672 + }, + { + "epoch": 0.36, + "grad_norm": 1.1066103052145848, + "learning_rate": 1.4740330222746653e-05, + "loss": 0.5507, + "step": 4673 + }, + { + "epoch": 0.36, + "grad_norm": 1.056525938117344, + "learning_rate": 1.4738117648486375e-05, + "loss": 0.5389, + "step": 4674 + }, + { + "epoch": 0.36, + "grad_norm": 1.242217777291066, + "learning_rate": 1.473590477508428e-05, + "loss": 0.6362, + "step": 4675 + }, + { + "epoch": 0.36, + "grad_norm": 1.1351115607163516, + "learning_rate": 1.4733691602680088e-05, + "loss": 0.61, + "step": 4676 + }, + { + "epoch": 0.36, + "grad_norm": 1.10468385687717, + "learning_rate": 1.4731478131413519e-05, + "loss": 0.5112, + "step": 4677 + }, + { + "epoch": 0.36, + "grad_norm": 1.2354500350605704, + "learning_rate": 1.4729264361424325e-05, + "loss": 0.5876, + "step": 4678 + }, + { + "epoch": 0.36, + "grad_norm": 1.1107857022348495, + "learning_rate": 1.4727050292852272e-05, + "loss": 0.6043, + "step": 4679 + }, + { + "epoch": 0.36, + "grad_norm": 1.1257512445309525, + "learning_rate": 1.4724835925837146e-05, + "loss": 0.5526, + "step": 4680 + }, + { + "epoch": 0.36, + "grad_norm": 1.3035727667008492, + "learning_rate": 1.4722621260518752e-05, + "loss": 0.5747, + "step": 4681 + }, + { + "epoch": 0.36, + "grad_norm": 1.0787942440114195, + "learning_rate": 1.4720406297036913e-05, + "loss": 0.5057, + "step": 4682 + }, + { + "epoch": 0.36, + "grad_norm": 1.2150950279279153, + "learning_rate": 1.4718191035531468e-05, + "loss": 0.5375, + "step": 4683 + }, + { + "epoch": 0.36, + "grad_norm": 1.2089244229884872, + "learning_rate": 1.471597547614228e-05, + "loss": 0.5637, + "step": 4684 + }, + { + "epoch": 0.36, + "grad_norm": 1.140700925625432, + "learning_rate": 1.471375961900923e-05, + "loss": 0.5832, + "step": 4685 + }, + { + "epoch": 0.36, + "grad_norm": 1.1614544216939748, + "learning_rate": 1.4711543464272218e-05, + "loss": 0.5573, + "step": 4686 + }, + { + "epoch": 0.36, + "grad_norm": 1.2904417577530933, + "learning_rate": 1.4709327012071157e-05, + "loss": 0.5754, + "step": 4687 + }, + { + "epoch": 0.36, + "grad_norm": 1.1884137202052194, + "learning_rate": 1.4707110262545983e-05, + "loss": 0.5735, + "step": 4688 + }, + { + "epoch": 0.36, + "grad_norm": 1.3137574303268986, + "learning_rate": 1.4704893215836653e-05, + "loss": 0.6075, + "step": 4689 + }, + { + "epoch": 0.36, + "grad_norm": 1.1424801515486995, + "learning_rate": 1.4702675872083141e-05, + "loss": 0.591, + "step": 4690 + }, + { + "epoch": 0.36, + "grad_norm": 1.0066994723496627, + "learning_rate": 1.470045823142544e-05, + "loss": 0.5258, + "step": 4691 + }, + { + "epoch": 0.36, + "grad_norm": 1.0826355386328324, + "learning_rate": 1.469824029400356e-05, + "loss": 0.554, + "step": 4692 + }, + { + "epoch": 0.36, + "grad_norm": 1.171799059632538, + "learning_rate": 1.4696022059957527e-05, + "loss": 0.6025, + "step": 4693 + }, + { + "epoch": 0.36, + "grad_norm": 1.1649420059078481, + "learning_rate": 1.4693803529427393e-05, + "loss": 0.5831, + "step": 4694 + }, + { + "epoch": 0.36, + "grad_norm": 1.2874050234975287, + "learning_rate": 1.469158470255323e-05, + "loss": 0.5842, + "step": 4695 + }, + { + "epoch": 0.36, + "grad_norm": 1.1068601445084842, + "learning_rate": 1.4689365579475117e-05, + "loss": 0.5143, + "step": 4696 + }, + { + "epoch": 0.36, + "grad_norm": 1.2106024001596387, + "learning_rate": 1.4687146160333162e-05, + "loss": 0.582, + "step": 4697 + }, + { + "epoch": 0.36, + "grad_norm": 1.1885202943628286, + "learning_rate": 1.4684926445267485e-05, + "loss": 0.5579, + "step": 4698 + }, + { + "epoch": 0.36, + "grad_norm": 1.265046258163542, + "learning_rate": 1.4682706434418229e-05, + "loss": 0.6054, + "step": 4699 + }, + { + "epoch": 0.36, + "grad_norm": 1.1653156064217927, + "learning_rate": 1.468048612792556e-05, + "loss": 0.6027, + "step": 4700 + }, + { + "epoch": 0.36, + "grad_norm": 1.2136577446995616, + "learning_rate": 1.467826552592965e-05, + "loss": 0.584, + "step": 4701 + }, + { + "epoch": 0.36, + "grad_norm": 1.1824699581112545, + "learning_rate": 1.4676044628570707e-05, + "loss": 0.5522, + "step": 4702 + }, + { + "epoch": 0.36, + "grad_norm": 0.9981257338452493, + "learning_rate": 1.4673823435988933e-05, + "loss": 0.4888, + "step": 4703 + }, + { + "epoch": 0.36, + "grad_norm": 1.2072119716422292, + "learning_rate": 1.4671601948324577e-05, + "loss": 0.5952, + "step": 4704 + }, + { + "epoch": 0.37, + "grad_norm": 1.1882638983929963, + "learning_rate": 1.4669380165717889e-05, + "loss": 0.6087, + "step": 4705 + }, + { + "epoch": 0.37, + "grad_norm": 1.216398121752131, + "learning_rate": 1.4667158088309137e-05, + "loss": 0.6106, + "step": 4706 + }, + { + "epoch": 0.37, + "grad_norm": 1.187912919381898, + "learning_rate": 1.4664935716238615e-05, + "loss": 0.5989, + "step": 4707 + }, + { + "epoch": 0.37, + "grad_norm": 1.1497304476083048, + "learning_rate": 1.4662713049646637e-05, + "loss": 0.6008, + "step": 4708 + }, + { + "epoch": 0.37, + "grad_norm": 1.152337905093095, + "learning_rate": 1.4660490088673525e-05, + "loss": 0.56, + "step": 4709 + }, + { + "epoch": 0.37, + "grad_norm": 1.1848559306485862, + "learning_rate": 1.4658266833459629e-05, + "loss": 0.5928, + "step": 4710 + }, + { + "epoch": 0.37, + "grad_norm": 1.2017497835676911, + "learning_rate": 1.4656043284145316e-05, + "loss": 0.5771, + "step": 4711 + }, + { + "epoch": 0.37, + "grad_norm": 1.1954164646787175, + "learning_rate": 1.4653819440870965e-05, + "loss": 0.5681, + "step": 4712 + }, + { + "epoch": 0.37, + "grad_norm": 1.0869511326366013, + "learning_rate": 1.4651595303776986e-05, + "loss": 0.5519, + "step": 4713 + }, + { + "epoch": 0.37, + "grad_norm": 1.0623309618054928, + "learning_rate": 1.4649370873003794e-05, + "loss": 0.5548, + "step": 4714 + }, + { + "epoch": 0.37, + "grad_norm": 1.2850538404295495, + "learning_rate": 1.4647146148691831e-05, + "loss": 0.6135, + "step": 4715 + }, + { + "epoch": 0.37, + "grad_norm": 1.1221894017593557, + "learning_rate": 1.4644921130981558e-05, + "loss": 0.5488, + "step": 4716 + }, + { + "epoch": 0.37, + "grad_norm": 1.1878619395351635, + "learning_rate": 1.4642695820013446e-05, + "loss": 0.5899, + "step": 4717 + }, + { + "epoch": 0.37, + "grad_norm": 1.2727229448034345, + "learning_rate": 1.4640470215927998e-05, + "loss": 0.6163, + "step": 4718 + }, + { + "epoch": 0.37, + "grad_norm": 1.28863749578199, + "learning_rate": 1.463824431886572e-05, + "loss": 0.602, + "step": 4719 + }, + { + "epoch": 0.37, + "grad_norm": 1.12852978616599, + "learning_rate": 1.4636018128967149e-05, + "loss": 0.5947, + "step": 4720 + }, + { + "epoch": 0.37, + "grad_norm": 1.2562087835126583, + "learning_rate": 1.4633791646372837e-05, + "loss": 0.6434, + "step": 4721 + }, + { + "epoch": 0.37, + "grad_norm": 1.373995761009685, + "learning_rate": 1.4631564871223346e-05, + "loss": 0.5608, + "step": 4722 + }, + { + "epoch": 0.37, + "grad_norm": 1.223947982203224, + "learning_rate": 1.4629337803659274e-05, + "loss": 0.5634, + "step": 4723 + }, + { + "epoch": 0.37, + "grad_norm": 1.1323658029570405, + "learning_rate": 1.4627110443821217e-05, + "loss": 0.6322, + "step": 4724 + }, + { + "epoch": 0.37, + "grad_norm": 1.1576216011355829, + "learning_rate": 1.462488279184981e-05, + "loss": 0.5958, + "step": 4725 + }, + { + "epoch": 0.37, + "grad_norm": 1.1545356722938178, + "learning_rate": 1.4622654847885688e-05, + "loss": 0.5311, + "step": 4726 + }, + { + "epoch": 0.37, + "grad_norm": 1.1210583405435337, + "learning_rate": 1.4620426612069519e-05, + "loss": 0.5077, + "step": 4727 + }, + { + "epoch": 0.37, + "grad_norm": 1.1855321942949153, + "learning_rate": 1.4618198084541977e-05, + "loss": 0.6277, + "step": 4728 + }, + { + "epoch": 0.37, + "grad_norm": 1.137618190787337, + "learning_rate": 1.4615969265443762e-05, + "loss": 0.5715, + "step": 4729 + }, + { + "epoch": 0.37, + "grad_norm": 1.088656308405371, + "learning_rate": 1.4613740154915594e-05, + "loss": 0.497, + "step": 4730 + }, + { + "epoch": 0.37, + "grad_norm": 1.1511257652986857, + "learning_rate": 1.4611510753098208e-05, + "loss": 0.6052, + "step": 4731 + }, + { + "epoch": 0.37, + "grad_norm": 1.2175385738896742, + "learning_rate": 1.4609281060132352e-05, + "loss": 0.5224, + "step": 4732 + }, + { + "epoch": 0.37, + "grad_norm": 1.3204040495603833, + "learning_rate": 1.4607051076158805e-05, + "loss": 0.6724, + "step": 4733 + }, + { + "epoch": 0.37, + "grad_norm": 1.2755219886974971, + "learning_rate": 1.4604820801318351e-05, + "loss": 0.5868, + "step": 4734 + }, + { + "epoch": 0.37, + "grad_norm": 1.0580284608174968, + "learning_rate": 1.4602590235751806e-05, + "loss": 0.5534, + "step": 4735 + }, + { + "epoch": 0.37, + "grad_norm": 1.1920072946805294, + "learning_rate": 1.4600359379599992e-05, + "loss": 0.5588, + "step": 4736 + }, + { + "epoch": 0.37, + "grad_norm": 1.2067158110939797, + "learning_rate": 1.4598128233003754e-05, + "loss": 0.6165, + "step": 4737 + }, + { + "epoch": 0.37, + "grad_norm": 1.1458799988462935, + "learning_rate": 1.4595896796103959e-05, + "loss": 0.5861, + "step": 4738 + }, + { + "epoch": 0.37, + "grad_norm": 1.2734813565618301, + "learning_rate": 1.4593665069041484e-05, + "loss": 0.5554, + "step": 4739 + }, + { + "epoch": 0.37, + "grad_norm": 1.0933440953884168, + "learning_rate": 1.4591433051957237e-05, + "loss": 0.5109, + "step": 4740 + }, + { + "epoch": 0.37, + "grad_norm": 1.079024064401115, + "learning_rate": 1.4589200744992134e-05, + "loss": 0.5394, + "step": 4741 + }, + { + "epoch": 0.37, + "grad_norm": 1.1387230156591666, + "learning_rate": 1.4586968148287106e-05, + "loss": 0.5086, + "step": 4742 + }, + { + "epoch": 0.37, + "grad_norm": 1.1098926035158136, + "learning_rate": 1.4584735261983118e-05, + "loss": 0.5804, + "step": 4743 + }, + { + "epoch": 0.37, + "grad_norm": 1.2295573901278383, + "learning_rate": 1.4582502086221136e-05, + "loss": 0.5668, + "step": 4744 + }, + { + "epoch": 0.37, + "grad_norm": 1.1761754791295764, + "learning_rate": 1.4580268621142155e-05, + "loss": 0.5156, + "step": 4745 + }, + { + "epoch": 0.37, + "grad_norm": 1.1185849925845492, + "learning_rate": 1.4578034866887186e-05, + "loss": 0.5891, + "step": 4746 + }, + { + "epoch": 0.37, + "grad_norm": 1.169121979865239, + "learning_rate": 1.4575800823597255e-05, + "loss": 0.5588, + "step": 4747 + }, + { + "epoch": 0.37, + "grad_norm": 1.2061909339951176, + "learning_rate": 1.4573566491413409e-05, + "loss": 0.6009, + "step": 4748 + }, + { + "epoch": 0.37, + "grad_norm": 1.2298542244132797, + "learning_rate": 1.4571331870476716e-05, + "loss": 0.6244, + "step": 4749 + }, + { + "epoch": 0.37, + "grad_norm": 1.2345690514344285, + "learning_rate": 1.4569096960928255e-05, + "loss": 0.5856, + "step": 4750 + }, + { + "epoch": 0.37, + "grad_norm": 1.28085516451661, + "learning_rate": 1.4566861762909133e-05, + "loss": 0.596, + "step": 4751 + }, + { + "epoch": 0.37, + "grad_norm": 1.1875397023038996, + "learning_rate": 1.456462627656046e-05, + "loss": 0.5694, + "step": 4752 + }, + { + "epoch": 0.37, + "grad_norm": 1.2369710446810522, + "learning_rate": 1.4562390502023384e-05, + "loss": 0.6166, + "step": 4753 + }, + { + "epoch": 0.37, + "grad_norm": 1.2610003903575175, + "learning_rate": 1.4560154439439056e-05, + "loss": 0.6153, + "step": 4754 + }, + { + "epoch": 0.37, + "grad_norm": 1.1656802923674987, + "learning_rate": 1.4557918088948652e-05, + "loss": 0.5819, + "step": 4755 + }, + { + "epoch": 0.37, + "grad_norm": 1.0626824166276674, + "learning_rate": 1.4555681450693365e-05, + "loss": 0.5174, + "step": 4756 + }, + { + "epoch": 0.37, + "grad_norm": 1.0902383288288213, + "learning_rate": 1.45534445248144e-05, + "loss": 0.5509, + "step": 4757 + }, + { + "epoch": 0.37, + "grad_norm": 1.2193939757324839, + "learning_rate": 1.4551207311452991e-05, + "loss": 0.5316, + "step": 4758 + }, + { + "epoch": 0.37, + "grad_norm": 1.202331281154888, + "learning_rate": 1.4548969810750382e-05, + "loss": 0.6397, + "step": 4759 + }, + { + "epoch": 0.37, + "grad_norm": 1.1999417588882453, + "learning_rate": 1.454673202284784e-05, + "loss": 0.5375, + "step": 4760 + }, + { + "epoch": 0.37, + "grad_norm": 1.1422843881574958, + "learning_rate": 1.4544493947886648e-05, + "loss": 0.5942, + "step": 4761 + }, + { + "epoch": 0.37, + "grad_norm": 1.162060956186703, + "learning_rate": 1.4542255586008105e-05, + "loss": 0.5818, + "step": 4762 + }, + { + "epoch": 0.37, + "grad_norm": 1.1087109902281118, + "learning_rate": 1.4540016937353531e-05, + "loss": 0.5732, + "step": 4763 + }, + { + "epoch": 0.37, + "grad_norm": 1.1766382686945234, + "learning_rate": 1.4537778002064268e-05, + "loss": 0.5504, + "step": 4764 + }, + { + "epoch": 0.37, + "grad_norm": 1.1572959396233062, + "learning_rate": 1.4535538780281666e-05, + "loss": 0.5931, + "step": 4765 + }, + { + "epoch": 0.37, + "grad_norm": 1.191828718482068, + "learning_rate": 1.4533299272147103e-05, + "loss": 0.5953, + "step": 4766 + }, + { + "epoch": 0.37, + "grad_norm": 1.1207880008610662, + "learning_rate": 1.4531059477801965e-05, + "loss": 0.5145, + "step": 4767 + }, + { + "epoch": 0.37, + "grad_norm": 1.1240205739877187, + "learning_rate": 1.4528819397387663e-05, + "loss": 0.4981, + "step": 4768 + }, + { + "epoch": 0.37, + "grad_norm": 1.1965528547645943, + "learning_rate": 1.4526579031045631e-05, + "loss": 0.6224, + "step": 4769 + }, + { + "epoch": 0.37, + "grad_norm": 1.2179431689930071, + "learning_rate": 1.452433837891731e-05, + "loss": 0.6308, + "step": 4770 + }, + { + "epoch": 0.37, + "grad_norm": 1.1330384127846598, + "learning_rate": 1.4522097441144166e-05, + "loss": 0.5509, + "step": 4771 + }, + { + "epoch": 0.37, + "grad_norm": 1.1060478931011515, + "learning_rate": 1.4519856217867676e-05, + "loss": 0.5299, + "step": 4772 + }, + { + "epoch": 0.37, + "grad_norm": 1.1436129586922932, + "learning_rate": 1.4517614709229345e-05, + "loss": 0.5858, + "step": 4773 + }, + { + "epoch": 0.37, + "grad_norm": 1.175322433628499, + "learning_rate": 1.451537291537069e-05, + "loss": 0.5401, + "step": 4774 + }, + { + "epoch": 0.37, + "grad_norm": 1.2540039784978272, + "learning_rate": 1.4513130836433247e-05, + "loss": 0.6495, + "step": 4775 + }, + { + "epoch": 0.37, + "grad_norm": 1.2930560125770585, + "learning_rate": 1.451088847255857e-05, + "loss": 0.6149, + "step": 4776 + }, + { + "epoch": 0.37, + "grad_norm": 1.211144066696648, + "learning_rate": 1.4508645823888228e-05, + "loss": 0.6141, + "step": 4777 + }, + { + "epoch": 0.37, + "grad_norm": 1.1302796559601922, + "learning_rate": 1.4506402890563813e-05, + "loss": 0.5816, + "step": 4778 + }, + { + "epoch": 0.37, + "grad_norm": 1.2926972902345264, + "learning_rate": 1.4504159672726937e-05, + "loss": 0.6145, + "step": 4779 + }, + { + "epoch": 0.37, + "grad_norm": 1.201346978551093, + "learning_rate": 1.4501916170519221e-05, + "loss": 0.5835, + "step": 4780 + }, + { + "epoch": 0.37, + "grad_norm": 1.2820652949655547, + "learning_rate": 1.4499672384082312e-05, + "loss": 0.616, + "step": 4781 + }, + { + "epoch": 0.37, + "grad_norm": 1.250056027110474, + "learning_rate": 1.4497428313557866e-05, + "loss": 0.6359, + "step": 4782 + }, + { + "epoch": 0.37, + "grad_norm": 1.2056339904703068, + "learning_rate": 1.449518395908757e-05, + "loss": 0.581, + "step": 4783 + }, + { + "epoch": 0.37, + "grad_norm": 1.0803945122823422, + "learning_rate": 1.4492939320813117e-05, + "loss": 0.5485, + "step": 4784 + }, + { + "epoch": 0.37, + "grad_norm": 1.097617749390821, + "learning_rate": 1.4490694398876228e-05, + "loss": 0.5644, + "step": 4785 + }, + { + "epoch": 0.37, + "grad_norm": 1.2280019586870559, + "learning_rate": 1.448844919341863e-05, + "loss": 0.5822, + "step": 4786 + }, + { + "epoch": 0.37, + "grad_norm": 1.1724804649271054, + "learning_rate": 1.4486203704582075e-05, + "loss": 0.5991, + "step": 4787 + }, + { + "epoch": 0.37, + "grad_norm": 1.2577953455922122, + "learning_rate": 1.4483957932508338e-05, + "loss": 0.6295, + "step": 4788 + }, + { + "epoch": 0.37, + "grad_norm": 1.3325087958183344, + "learning_rate": 1.4481711877339202e-05, + "loss": 0.66, + "step": 4789 + }, + { + "epoch": 0.37, + "grad_norm": 1.1686819699321465, + "learning_rate": 1.447946553921647e-05, + "loss": 0.5606, + "step": 4790 + }, + { + "epoch": 0.37, + "grad_norm": 1.1877135034669553, + "learning_rate": 1.4477218918281967e-05, + "loss": 0.5913, + "step": 4791 + }, + { + "epoch": 0.37, + "grad_norm": 1.0846278574643111, + "learning_rate": 1.4474972014677537e-05, + "loss": 0.5443, + "step": 4792 + }, + { + "epoch": 0.37, + "grad_norm": 1.1014566573569242, + "learning_rate": 1.4472724828545035e-05, + "loss": 0.5775, + "step": 4793 + }, + { + "epoch": 0.37, + "grad_norm": 1.0747883344345852, + "learning_rate": 1.447047736002634e-05, + "loss": 0.5655, + "step": 4794 + }, + { + "epoch": 0.37, + "grad_norm": 1.2638569951254384, + "learning_rate": 1.4468229609263343e-05, + "loss": 0.592, + "step": 4795 + }, + { + "epoch": 0.37, + "grad_norm": 1.0124472569044305, + "learning_rate": 1.4465981576397957e-05, + "loss": 0.5514, + "step": 4796 + }, + { + "epoch": 0.37, + "grad_norm": 1.3247691223238744, + "learning_rate": 1.4463733261572114e-05, + "loss": 0.6501, + "step": 4797 + }, + { + "epoch": 0.37, + "grad_norm": 1.1495173042491573, + "learning_rate": 1.4461484664927758e-05, + "loss": 0.6023, + "step": 4798 + }, + { + "epoch": 0.37, + "grad_norm": 1.1215567347223456, + "learning_rate": 1.4459235786606861e-05, + "loss": 0.5399, + "step": 4799 + }, + { + "epoch": 0.37, + "grad_norm": 1.2231811274309439, + "learning_rate": 1.44569866267514e-05, + "loss": 0.5888, + "step": 4800 + }, + { + "epoch": 0.37, + "grad_norm": 1.2151362812078543, + "learning_rate": 1.4454737185503375e-05, + "loss": 0.5955, + "step": 4801 + }, + { + "epoch": 0.37, + "grad_norm": 1.082043864593238, + "learning_rate": 1.4452487463004815e-05, + "loss": 0.5976, + "step": 4802 + }, + { + "epoch": 0.37, + "grad_norm": 1.0975717532396327, + "learning_rate": 1.4450237459397742e-05, + "loss": 0.5471, + "step": 4803 + }, + { + "epoch": 0.37, + "grad_norm": 1.2611125517280033, + "learning_rate": 1.4447987174824225e-05, + "loss": 0.5977, + "step": 4804 + }, + { + "epoch": 0.37, + "grad_norm": 1.042119970235621, + "learning_rate": 1.4445736609426324e-05, + "loss": 0.5044, + "step": 4805 + }, + { + "epoch": 0.37, + "grad_norm": 1.2066174634277038, + "learning_rate": 1.4443485763346135e-05, + "loss": 0.6184, + "step": 4806 + }, + { + "epoch": 0.37, + "grad_norm": 1.081636763180403, + "learning_rate": 1.4441234636725767e-05, + "loss": 0.5276, + "step": 4807 + }, + { + "epoch": 0.37, + "grad_norm": 1.1733889654817238, + "learning_rate": 1.4438983229707338e-05, + "loss": 0.5471, + "step": 4808 + }, + { + "epoch": 0.37, + "grad_norm": 1.165812258980968, + "learning_rate": 1.4436731542433e-05, + "loss": 0.5866, + "step": 4809 + }, + { + "epoch": 0.37, + "grad_norm": 1.277588847032594, + "learning_rate": 1.4434479575044908e-05, + "loss": 0.5726, + "step": 4810 + }, + { + "epoch": 0.37, + "grad_norm": 1.3050374001591714, + "learning_rate": 1.443222732768524e-05, + "loss": 0.6454, + "step": 4811 + }, + { + "epoch": 0.37, + "grad_norm": 1.185846331764837, + "learning_rate": 1.4429974800496194e-05, + "loss": 0.563, + "step": 4812 + }, + { + "epoch": 0.37, + "grad_norm": 1.205603239378219, + "learning_rate": 1.4427721993619983e-05, + "loss": 0.5707, + "step": 4813 + }, + { + "epoch": 0.37, + "grad_norm": 1.2266679949252222, + "learning_rate": 1.4425468907198843e-05, + "loss": 0.5924, + "step": 4814 + }, + { + "epoch": 0.37, + "grad_norm": 1.149111387837983, + "learning_rate": 1.4423215541375013e-05, + "loss": 0.5273, + "step": 4815 + }, + { + "epoch": 0.37, + "grad_norm": 1.1426260652836573, + "learning_rate": 1.4420961896290764e-05, + "loss": 0.5476, + "step": 4816 + }, + { + "epoch": 0.37, + "grad_norm": 1.303687711930291, + "learning_rate": 1.4418707972088386e-05, + "loss": 0.7074, + "step": 4817 + }, + { + "epoch": 0.37, + "grad_norm": 1.138059840926854, + "learning_rate": 1.4416453768910173e-05, + "loss": 0.5787, + "step": 4818 + }, + { + "epoch": 0.37, + "grad_norm": 1.1330971721886915, + "learning_rate": 1.4414199286898449e-05, + "loss": 0.5522, + "step": 4819 + }, + { + "epoch": 0.37, + "grad_norm": 1.2630363646925478, + "learning_rate": 1.4411944526195551e-05, + "loss": 0.608, + "step": 4820 + }, + { + "epoch": 0.37, + "grad_norm": 1.1907944906550014, + "learning_rate": 1.4409689486943829e-05, + "loss": 0.5857, + "step": 4821 + }, + { + "epoch": 0.37, + "grad_norm": 1.2335958315328632, + "learning_rate": 1.4407434169285664e-05, + "loss": 0.5955, + "step": 4822 + }, + { + "epoch": 0.37, + "grad_norm": 1.1670974492439778, + "learning_rate": 1.4405178573363435e-05, + "loss": 0.5905, + "step": 4823 + }, + { + "epoch": 0.37, + "grad_norm": 1.194352506209174, + "learning_rate": 1.4402922699319557e-05, + "loss": 0.5869, + "step": 4824 + }, + { + "epoch": 0.37, + "grad_norm": 1.102264180816152, + "learning_rate": 1.4400666547296456e-05, + "loss": 0.5743, + "step": 4825 + }, + { + "epoch": 0.37, + "grad_norm": 1.0537866206384232, + "learning_rate": 1.4398410117436566e-05, + "loss": 0.4935, + "step": 4826 + }, + { + "epoch": 0.37, + "grad_norm": 1.1581988535604466, + "learning_rate": 1.4396153409882356e-05, + "loss": 0.5872, + "step": 4827 + }, + { + "epoch": 0.37, + "grad_norm": 1.1849326941936331, + "learning_rate": 1.4393896424776296e-05, + "loss": 0.6047, + "step": 4828 + }, + { + "epoch": 0.37, + "grad_norm": 1.1805735917053748, + "learning_rate": 1.439163916226089e-05, + "loss": 0.5915, + "step": 4829 + }, + { + "epoch": 0.37, + "grad_norm": 1.1801905790891936, + "learning_rate": 1.4389381622478644e-05, + "loss": 0.5934, + "step": 4830 + }, + { + "epoch": 0.37, + "grad_norm": 1.1340017456027796, + "learning_rate": 1.438712380557209e-05, + "loss": 0.5643, + "step": 4831 + }, + { + "epoch": 0.37, + "grad_norm": 1.1279649869341102, + "learning_rate": 1.4384865711683778e-05, + "loss": 0.5166, + "step": 4832 + }, + { + "epoch": 0.37, + "grad_norm": 1.099863189078922, + "learning_rate": 1.4382607340956265e-05, + "loss": 0.5906, + "step": 4833 + }, + { + "epoch": 0.38, + "grad_norm": 1.1926667596270755, + "learning_rate": 1.4380348693532144e-05, + "loss": 0.6094, + "step": 4834 + }, + { + "epoch": 0.38, + "grad_norm": 1.2270666167713262, + "learning_rate": 1.4378089769554009e-05, + "loss": 0.6197, + "step": 4835 + }, + { + "epoch": 0.38, + "grad_norm": 1.2928307220450972, + "learning_rate": 1.4375830569164478e-05, + "loss": 0.6853, + "step": 4836 + }, + { + "epoch": 0.38, + "grad_norm": 1.1981011566511857, + "learning_rate": 1.4373571092506189e-05, + "loss": 0.5279, + "step": 4837 + }, + { + "epoch": 0.38, + "grad_norm": 1.0756776093512608, + "learning_rate": 1.437131133972179e-05, + "loss": 0.4926, + "step": 4838 + }, + { + "epoch": 0.38, + "grad_norm": 1.200482029263595, + "learning_rate": 1.4369051310953954e-05, + "loss": 0.5534, + "step": 4839 + }, + { + "epoch": 0.38, + "grad_norm": 1.1552876256360114, + "learning_rate": 1.436679100634537e-05, + "loss": 0.5862, + "step": 4840 + }, + { + "epoch": 0.38, + "grad_norm": 1.2420653277122962, + "learning_rate": 1.4364530426038734e-05, + "loss": 0.6291, + "step": 4841 + }, + { + "epoch": 0.38, + "grad_norm": 1.114312908605802, + "learning_rate": 1.436226957017678e-05, + "loss": 0.5336, + "step": 4842 + }, + { + "epoch": 0.38, + "grad_norm": 1.1131179422093747, + "learning_rate": 1.436000843890224e-05, + "loss": 0.5634, + "step": 4843 + }, + { + "epoch": 0.38, + "grad_norm": 1.042571545805598, + "learning_rate": 1.435774703235787e-05, + "loss": 0.5255, + "step": 4844 + }, + { + "epoch": 0.38, + "grad_norm": 1.1318485762883639, + "learning_rate": 1.4355485350686449e-05, + "loss": 0.5397, + "step": 4845 + }, + { + "epoch": 0.38, + "grad_norm": 1.2434428847034074, + "learning_rate": 1.4353223394030767e-05, + "loss": 0.6402, + "step": 4846 + }, + { + "epoch": 0.38, + "grad_norm": 1.1766315819874225, + "learning_rate": 1.4350961162533627e-05, + "loss": 0.5766, + "step": 4847 + }, + { + "epoch": 0.38, + "grad_norm": 1.2268830384716451, + "learning_rate": 1.434869865633787e-05, + "loss": 0.5781, + "step": 4848 + }, + { + "epoch": 0.38, + "grad_norm": 1.2116030433056135, + "learning_rate": 1.4346435875586324e-05, + "loss": 0.6044, + "step": 4849 + }, + { + "epoch": 0.38, + "grad_norm": 1.1843054114358138, + "learning_rate": 1.434417282042186e-05, + "loss": 0.5525, + "step": 4850 + }, + { + "epoch": 0.38, + "grad_norm": 1.1525653690658018, + "learning_rate": 1.434190949098735e-05, + "loss": 0.5536, + "step": 4851 + }, + { + "epoch": 0.38, + "grad_norm": 1.2550317104482467, + "learning_rate": 1.4339645887425693e-05, + "loss": 0.6192, + "step": 4852 + }, + { + "epoch": 0.38, + "grad_norm": 1.2371649779919311, + "learning_rate": 1.4337382009879806e-05, + "loss": 0.5949, + "step": 4853 + }, + { + "epoch": 0.38, + "grad_norm": 1.1263838839364222, + "learning_rate": 1.433511785849261e-05, + "loss": 0.5938, + "step": 4854 + }, + { + "epoch": 0.38, + "grad_norm": 1.234956628447132, + "learning_rate": 1.433285343340706e-05, + "loss": 0.627, + "step": 4855 + }, + { + "epoch": 0.38, + "grad_norm": 1.0569472229484256, + "learning_rate": 1.4330588734766113e-05, + "loss": 0.5622, + "step": 4856 + }, + { + "epoch": 0.38, + "grad_norm": 1.2879857045262912, + "learning_rate": 1.432832376271276e-05, + "loss": 0.5922, + "step": 4857 + }, + { + "epoch": 0.38, + "grad_norm": 1.107385759252496, + "learning_rate": 1.4326058517389998e-05, + "loss": 0.5996, + "step": 4858 + }, + { + "epoch": 0.38, + "grad_norm": 1.2098262487143545, + "learning_rate": 1.432379299894084e-05, + "loss": 0.5246, + "step": 4859 + }, + { + "epoch": 0.38, + "grad_norm": 1.183172220133152, + "learning_rate": 1.4321527207508324e-05, + "loss": 0.6075, + "step": 4860 + }, + { + "epoch": 0.38, + "grad_norm": 1.2062395579622367, + "learning_rate": 1.4319261143235496e-05, + "loss": 0.6014, + "step": 4861 + }, + { + "epoch": 0.38, + "grad_norm": 1.2227280080779541, + "learning_rate": 1.431699480626543e-05, + "loss": 0.6177, + "step": 4862 + }, + { + "epoch": 0.38, + "grad_norm": 1.1390181942291235, + "learning_rate": 1.431472819674121e-05, + "loss": 0.6039, + "step": 4863 + }, + { + "epoch": 0.38, + "grad_norm": 1.2663982054580196, + "learning_rate": 1.4312461314805936e-05, + "loss": 0.6207, + "step": 4864 + }, + { + "epoch": 0.38, + "grad_norm": 1.099522588906349, + "learning_rate": 1.431019416060273e-05, + "loss": 0.5091, + "step": 4865 + }, + { + "epoch": 0.38, + "grad_norm": 1.1178514868272384, + "learning_rate": 1.430792673427473e-05, + "loss": 0.5329, + "step": 4866 + }, + { + "epoch": 0.38, + "grad_norm": 1.1970950529430149, + "learning_rate": 1.4305659035965087e-05, + "loss": 0.5721, + "step": 4867 + }, + { + "epoch": 0.38, + "grad_norm": 1.1873983038472684, + "learning_rate": 1.430339106581698e-05, + "loss": 0.5644, + "step": 4868 + }, + { + "epoch": 0.38, + "grad_norm": 1.2634657819980597, + "learning_rate": 1.4301122823973588e-05, + "loss": 0.5808, + "step": 4869 + }, + { + "epoch": 0.38, + "grad_norm": 1.1288353392825703, + "learning_rate": 1.429885431057812e-05, + "loss": 0.5662, + "step": 4870 + }, + { + "epoch": 0.38, + "grad_norm": 1.1901173608272502, + "learning_rate": 1.4296585525773803e-05, + "loss": 0.6045, + "step": 4871 + }, + { + "epoch": 0.38, + "grad_norm": 1.1990183808872086, + "learning_rate": 1.4294316469703873e-05, + "loss": 0.5554, + "step": 4872 + }, + { + "epoch": 0.38, + "grad_norm": 1.162629954960051, + "learning_rate": 1.429204714251159e-05, + "loss": 0.5556, + "step": 4873 + }, + { + "epoch": 0.38, + "grad_norm": 1.0618223946727976, + "learning_rate": 1.4289777544340227e-05, + "loss": 0.6083, + "step": 4874 + }, + { + "epoch": 0.38, + "grad_norm": 1.2307555818030167, + "learning_rate": 1.4287507675333075e-05, + "loss": 0.6497, + "step": 4875 + }, + { + "epoch": 0.38, + "grad_norm": 1.1254117530076173, + "learning_rate": 1.4285237535633442e-05, + "loss": 0.5986, + "step": 4876 + }, + { + "epoch": 0.38, + "grad_norm": 1.1017090179182312, + "learning_rate": 1.4282967125384652e-05, + "loss": 0.5943, + "step": 4877 + }, + { + "epoch": 0.38, + "grad_norm": 1.1277232059550661, + "learning_rate": 1.4280696444730057e-05, + "loss": 0.5673, + "step": 4878 + }, + { + "epoch": 0.38, + "grad_norm": 1.217047553776633, + "learning_rate": 1.4278425493813004e-05, + "loss": 0.6337, + "step": 4879 + }, + { + "epoch": 0.38, + "grad_norm": 1.1508302739015464, + "learning_rate": 1.4276154272776876e-05, + "loss": 0.5765, + "step": 4880 + }, + { + "epoch": 0.38, + "grad_norm": 1.1430070642501653, + "learning_rate": 1.4273882781765069e-05, + "loss": 0.5336, + "step": 4881 + }, + { + "epoch": 0.38, + "grad_norm": 1.0953040390667381, + "learning_rate": 1.427161102092099e-05, + "loss": 0.5424, + "step": 4882 + }, + { + "epoch": 0.38, + "grad_norm": 1.2456077177893994, + "learning_rate": 1.4269338990388072e-05, + "loss": 0.5926, + "step": 4883 + }, + { + "epoch": 0.38, + "grad_norm": 1.157212140601062, + "learning_rate": 1.4267066690309754e-05, + "loss": 0.5766, + "step": 4884 + }, + { + "epoch": 0.38, + "grad_norm": 1.2686799936758892, + "learning_rate": 1.4264794120829499e-05, + "loss": 0.6392, + "step": 4885 + }, + { + "epoch": 0.38, + "grad_norm": 1.3063366455106868, + "learning_rate": 1.4262521282090791e-05, + "loss": 0.6272, + "step": 4886 + }, + { + "epoch": 0.38, + "grad_norm": 1.0878681064103706, + "learning_rate": 1.4260248174237121e-05, + "loss": 0.6112, + "step": 4887 + }, + { + "epoch": 0.38, + "grad_norm": 1.1843401377838811, + "learning_rate": 1.4257974797412006e-05, + "loss": 0.6256, + "step": 4888 + }, + { + "epoch": 0.38, + "grad_norm": 1.2296357256642696, + "learning_rate": 1.4255701151758972e-05, + "loss": 0.6256, + "step": 4889 + }, + { + "epoch": 0.38, + "grad_norm": 1.0200358344309548, + "learning_rate": 1.4253427237421567e-05, + "loss": 0.5213, + "step": 4890 + }, + { + "epoch": 0.38, + "grad_norm": 1.2829383797628846, + "learning_rate": 1.4251153054543357e-05, + "loss": 0.6187, + "step": 4891 + }, + { + "epoch": 0.38, + "grad_norm": 1.0860861806834865, + "learning_rate": 1.4248878603267922e-05, + "loss": 0.5749, + "step": 4892 + }, + { + "epoch": 0.38, + "grad_norm": 1.1488703735842798, + "learning_rate": 1.4246603883738859e-05, + "loss": 0.6043, + "step": 4893 + }, + { + "epoch": 0.38, + "grad_norm": 1.1044169568142737, + "learning_rate": 1.4244328896099782e-05, + "loss": 0.5685, + "step": 4894 + }, + { + "epoch": 0.38, + "grad_norm": 1.1651589778762321, + "learning_rate": 1.4242053640494322e-05, + "loss": 0.5881, + "step": 4895 + }, + { + "epoch": 0.38, + "grad_norm": 1.1452661960291812, + "learning_rate": 1.4239778117066132e-05, + "loss": 0.5642, + "step": 4896 + }, + { + "epoch": 0.38, + "grad_norm": 1.048361903732733, + "learning_rate": 1.4237502325958876e-05, + "loss": 0.5737, + "step": 4897 + }, + { + "epoch": 0.38, + "grad_norm": 1.2277367189186532, + "learning_rate": 1.4235226267316234e-05, + "loss": 0.5551, + "step": 4898 + }, + { + "epoch": 0.38, + "grad_norm": 1.3070099313004027, + "learning_rate": 1.4232949941281908e-05, + "loss": 0.6612, + "step": 4899 + }, + { + "epoch": 0.38, + "grad_norm": 1.2529759268430862, + "learning_rate": 1.4230673347999612e-05, + "loss": 0.627, + "step": 4900 + }, + { + "epoch": 0.38, + "grad_norm": 1.2075407563907425, + "learning_rate": 1.4228396487613081e-05, + "loss": 0.6432, + "step": 4901 + }, + { + "epoch": 0.38, + "grad_norm": 1.125172972627046, + "learning_rate": 1.4226119360266065e-05, + "loss": 0.5594, + "step": 4902 + }, + { + "epoch": 0.38, + "grad_norm": 1.3071368861676242, + "learning_rate": 1.4223841966102327e-05, + "loss": 0.6807, + "step": 4903 + }, + { + "epoch": 0.38, + "grad_norm": 1.0771641526930442, + "learning_rate": 1.4221564305265657e-05, + "loss": 0.56, + "step": 4904 + }, + { + "epoch": 0.38, + "grad_norm": 1.1093863365776122, + "learning_rate": 1.4219286377899848e-05, + "loss": 0.5597, + "step": 4905 + }, + { + "epoch": 0.38, + "grad_norm": 1.1987146069629033, + "learning_rate": 1.4217008184148727e-05, + "loss": 0.5724, + "step": 4906 + }, + { + "epoch": 0.38, + "grad_norm": 1.3141783473854975, + "learning_rate": 1.4214729724156118e-05, + "loss": 0.6102, + "step": 4907 + }, + { + "epoch": 0.38, + "grad_norm": 1.2733609697245454, + "learning_rate": 1.4212450998065876e-05, + "loss": 0.6545, + "step": 4908 + }, + { + "epoch": 0.38, + "grad_norm": 1.1138952870765229, + "learning_rate": 1.4210172006021872e-05, + "loss": 0.5782, + "step": 4909 + }, + { + "epoch": 0.38, + "grad_norm": 1.0836727759843126, + "learning_rate": 1.4207892748167985e-05, + "loss": 0.5971, + "step": 4910 + }, + { + "epoch": 0.38, + "grad_norm": 1.1116477359225532, + "learning_rate": 1.420561322464812e-05, + "loss": 0.5668, + "step": 4911 + }, + { + "epoch": 0.38, + "grad_norm": 1.188494466679667, + "learning_rate": 1.4203333435606196e-05, + "loss": 0.5782, + "step": 4912 + }, + { + "epoch": 0.38, + "grad_norm": 1.276618728426569, + "learning_rate": 1.4201053381186141e-05, + "loss": 0.5927, + "step": 4913 + }, + { + "epoch": 0.38, + "grad_norm": 1.1233440503553442, + "learning_rate": 1.4198773061531917e-05, + "loss": 0.5638, + "step": 4914 + }, + { + "epoch": 0.38, + "grad_norm": 0.9997992612106881, + "learning_rate": 1.4196492476787482e-05, + "loss": 0.5276, + "step": 4915 + }, + { + "epoch": 0.38, + "grad_norm": 1.2454187843217115, + "learning_rate": 1.419421162709683e-05, + "loss": 0.6094, + "step": 4916 + }, + { + "epoch": 0.38, + "grad_norm": 1.1780997701113802, + "learning_rate": 1.4191930512603956e-05, + "loss": 0.5681, + "step": 4917 + }, + { + "epoch": 0.38, + "grad_norm": 1.295072060645785, + "learning_rate": 1.4189649133452881e-05, + "loss": 0.5976, + "step": 4918 + }, + { + "epoch": 0.38, + "grad_norm": 1.0245101200011653, + "learning_rate": 1.4187367489787642e-05, + "loss": 0.5122, + "step": 4919 + }, + { + "epoch": 0.38, + "grad_norm": 1.2923113024446344, + "learning_rate": 1.4185085581752289e-05, + "loss": 0.5874, + "step": 4920 + }, + { + "epoch": 0.38, + "grad_norm": 1.1116119719513675, + "learning_rate": 1.4182803409490891e-05, + "loss": 0.59, + "step": 4921 + }, + { + "epoch": 0.38, + "grad_norm": 1.260335346887879, + "learning_rate": 1.4180520973147534e-05, + "loss": 0.6102, + "step": 4922 + }, + { + "epoch": 0.38, + "grad_norm": 1.2737362283068865, + "learning_rate": 1.4178238272866316e-05, + "loss": 0.5511, + "step": 4923 + }, + { + "epoch": 0.38, + "grad_norm": 1.1566248105818988, + "learning_rate": 1.4175955308791363e-05, + "loss": 0.5751, + "step": 4924 + }, + { + "epoch": 0.38, + "grad_norm": 1.2336226476240444, + "learning_rate": 1.4173672081066806e-05, + "loss": 0.569, + "step": 4925 + }, + { + "epoch": 0.38, + "grad_norm": 1.2475597881751865, + "learning_rate": 1.4171388589836792e-05, + "loss": 0.5867, + "step": 4926 + }, + { + "epoch": 0.38, + "grad_norm": 1.0598683584090418, + "learning_rate": 1.4169104835245502e-05, + "loss": 0.5513, + "step": 4927 + }, + { + "epoch": 0.38, + "grad_norm": 1.2302143621122017, + "learning_rate": 1.416682081743711e-05, + "loss": 0.6471, + "step": 4928 + }, + { + "epoch": 0.38, + "grad_norm": 1.239611515825394, + "learning_rate": 1.4164536536555824e-05, + "loss": 0.5612, + "step": 4929 + }, + { + "epoch": 0.38, + "grad_norm": 1.1506194063679558, + "learning_rate": 1.4162251992745858e-05, + "loss": 0.5476, + "step": 4930 + }, + { + "epoch": 0.38, + "grad_norm": 1.1681051015205106, + "learning_rate": 1.4159967186151446e-05, + "loss": 0.6005, + "step": 4931 + }, + { + "epoch": 0.38, + "grad_norm": 1.3163056377595899, + "learning_rate": 1.4157682116916851e-05, + "loss": 0.6112, + "step": 4932 + }, + { + "epoch": 0.38, + "grad_norm": 1.256711584666228, + "learning_rate": 1.415539678518633e-05, + "loss": 0.6149, + "step": 4933 + }, + { + "epoch": 0.38, + "grad_norm": 1.202299007899473, + "learning_rate": 1.415311119110417e-05, + "loss": 0.6107, + "step": 4934 + }, + { + "epoch": 0.38, + "grad_norm": 1.3072290392866028, + "learning_rate": 1.415082533481467e-05, + "loss": 0.6436, + "step": 4935 + }, + { + "epoch": 0.38, + "grad_norm": 1.1667086843462804, + "learning_rate": 1.4148539216462153e-05, + "loss": 0.6443, + "step": 4936 + }, + { + "epoch": 0.38, + "grad_norm": 1.1747074229628005, + "learning_rate": 1.4146252836190958e-05, + "loss": 0.5495, + "step": 4937 + }, + { + "epoch": 0.38, + "grad_norm": 1.2530183589774615, + "learning_rate": 1.4143966194145424e-05, + "loss": 0.6058, + "step": 4938 + }, + { + "epoch": 0.38, + "grad_norm": 1.0756300654449973, + "learning_rate": 1.4141679290469925e-05, + "loss": 0.5889, + "step": 4939 + }, + { + "epoch": 0.38, + "grad_norm": 1.251629482577088, + "learning_rate": 1.4139392125308842e-05, + "loss": 0.643, + "step": 4940 + }, + { + "epoch": 0.38, + "grad_norm": 1.1757323803044788, + "learning_rate": 1.413710469880658e-05, + "loss": 0.5421, + "step": 4941 + }, + { + "epoch": 0.38, + "grad_norm": 1.2961164807056123, + "learning_rate": 1.4134817011107555e-05, + "loss": 0.5945, + "step": 4942 + }, + { + "epoch": 0.38, + "grad_norm": 1.155718913411746, + "learning_rate": 1.4132529062356197e-05, + "loss": 0.5825, + "step": 4943 + }, + { + "epoch": 0.38, + "grad_norm": 1.1206681776265388, + "learning_rate": 1.4130240852696958e-05, + "loss": 0.5698, + "step": 4944 + }, + { + "epoch": 0.38, + "grad_norm": 1.0634337417486805, + "learning_rate": 1.4127952382274305e-05, + "loss": 0.5843, + "step": 4945 + }, + { + "epoch": 0.38, + "grad_norm": 1.1516916790043479, + "learning_rate": 1.4125663651232725e-05, + "loss": 0.5607, + "step": 4946 + }, + { + "epoch": 0.38, + "grad_norm": 1.188723084860073, + "learning_rate": 1.4123374659716708e-05, + "loss": 0.5515, + "step": 4947 + }, + { + "epoch": 0.38, + "grad_norm": 1.1545604527515752, + "learning_rate": 1.4121085407870776e-05, + "loss": 0.5645, + "step": 4948 + }, + { + "epoch": 0.38, + "grad_norm": 1.04304921272021, + "learning_rate": 1.411879589583946e-05, + "loss": 0.5281, + "step": 4949 + }, + { + "epoch": 0.38, + "grad_norm": 1.142491942195733, + "learning_rate": 1.4116506123767308e-05, + "loss": 0.5592, + "step": 4950 + }, + { + "epoch": 0.38, + "grad_norm": 1.1670457133383783, + "learning_rate": 1.411421609179889e-05, + "loss": 0.5679, + "step": 4951 + }, + { + "epoch": 0.38, + "grad_norm": 1.1221702272237901, + "learning_rate": 1.411192580007878e-05, + "loss": 0.5746, + "step": 4952 + }, + { + "epoch": 0.38, + "grad_norm": 1.189275167648375, + "learning_rate": 1.410963524875158e-05, + "loss": 0.5174, + "step": 4953 + }, + { + "epoch": 0.38, + "grad_norm": 1.2529479075271828, + "learning_rate": 1.4107344437961902e-05, + "loss": 0.5678, + "step": 4954 + }, + { + "epoch": 0.38, + "grad_norm": 1.2215801536941462, + "learning_rate": 1.4105053367854382e-05, + "loss": 0.6142, + "step": 4955 + }, + { + "epoch": 0.38, + "grad_norm": 1.1335327489570246, + "learning_rate": 1.4102762038573663e-05, + "loss": 0.5655, + "step": 4956 + }, + { + "epoch": 0.38, + "grad_norm": 1.2208438883684272, + "learning_rate": 1.410047045026441e-05, + "loss": 0.6312, + "step": 4957 + }, + { + "epoch": 0.38, + "grad_norm": 1.177590736246212, + "learning_rate": 1.40981786030713e-05, + "loss": 0.5922, + "step": 4958 + }, + { + "epoch": 0.38, + "grad_norm": 1.097529556727991, + "learning_rate": 1.4095886497139029e-05, + "loss": 0.5457, + "step": 4959 + }, + { + "epoch": 0.38, + "grad_norm": 1.1193182306681342, + "learning_rate": 1.4093594132612314e-05, + "loss": 0.4991, + "step": 4960 + }, + { + "epoch": 0.38, + "grad_norm": 1.1690475941767988, + "learning_rate": 1.4091301509635882e-05, + "loss": 0.533, + "step": 4961 + }, + { + "epoch": 0.38, + "grad_norm": 1.168178067540347, + "learning_rate": 1.4089008628354477e-05, + "loss": 0.6165, + "step": 4962 + }, + { + "epoch": 0.39, + "grad_norm": 1.1656709861377175, + "learning_rate": 1.4086715488912858e-05, + "loss": 0.6242, + "step": 4963 + }, + { + "epoch": 0.39, + "grad_norm": 1.2235963272537176, + "learning_rate": 1.4084422091455808e-05, + "loss": 0.6116, + "step": 4964 + }, + { + "epoch": 0.39, + "grad_norm": 1.2199280254051401, + "learning_rate": 1.408212843612812e-05, + "loss": 0.5957, + "step": 4965 + }, + { + "epoch": 0.39, + "grad_norm": 1.090983625325818, + "learning_rate": 1.40798345230746e-05, + "loss": 0.571, + "step": 4966 + }, + { + "epoch": 0.39, + "grad_norm": 1.1602652142149223, + "learning_rate": 1.407754035244008e-05, + "loss": 0.5741, + "step": 4967 + }, + { + "epoch": 0.39, + "grad_norm": 1.1282554783484182, + "learning_rate": 1.4075245924369397e-05, + "loss": 0.5207, + "step": 4968 + }, + { + "epoch": 0.39, + "grad_norm": 1.1144929950266353, + "learning_rate": 1.4072951239007414e-05, + "loss": 0.5441, + "step": 4969 + }, + { + "epoch": 0.39, + "grad_norm": 1.2034443518147127, + "learning_rate": 1.4070656296499006e-05, + "loss": 0.6181, + "step": 4970 + }, + { + "epoch": 0.39, + "grad_norm": 1.2046234410691394, + "learning_rate": 1.4068361096989064e-05, + "loss": 0.5711, + "step": 4971 + }, + { + "epoch": 0.39, + "grad_norm": 1.270449403186937, + "learning_rate": 1.4066065640622499e-05, + "loss": 0.6, + "step": 4972 + }, + { + "epoch": 0.39, + "grad_norm": 1.1815090137675432, + "learning_rate": 1.4063769927544228e-05, + "loss": 0.5679, + "step": 4973 + }, + { + "epoch": 0.39, + "grad_norm": 1.1709912846319916, + "learning_rate": 1.4061473957899194e-05, + "loss": 0.5673, + "step": 4974 + }, + { + "epoch": 0.39, + "grad_norm": 1.194542281384923, + "learning_rate": 1.4059177731832358e-05, + "loss": 0.6002, + "step": 4975 + }, + { + "epoch": 0.39, + "grad_norm": 1.1589470929666976, + "learning_rate": 1.4056881249488692e-05, + "loss": 0.5775, + "step": 4976 + }, + { + "epoch": 0.39, + "grad_norm": 1.1443728184327646, + "learning_rate": 1.4054584511013178e-05, + "loss": 0.5174, + "step": 4977 + }, + { + "epoch": 0.39, + "grad_norm": 1.202087053704193, + "learning_rate": 1.4052287516550824e-05, + "loss": 0.5972, + "step": 4978 + }, + { + "epoch": 0.39, + "grad_norm": 1.2615736650601708, + "learning_rate": 1.4049990266246652e-05, + "loss": 0.6203, + "step": 4979 + }, + { + "epoch": 0.39, + "grad_norm": 1.174427761579736, + "learning_rate": 1.4047692760245702e-05, + "loss": 0.5885, + "step": 4980 + }, + { + "epoch": 0.39, + "grad_norm": 1.1007120623751325, + "learning_rate": 1.4045394998693022e-05, + "loss": 0.5585, + "step": 4981 + }, + { + "epoch": 0.39, + "grad_norm": 1.2990526048376556, + "learning_rate": 1.4043096981733687e-05, + "loss": 0.6313, + "step": 4982 + }, + { + "epoch": 0.39, + "grad_norm": 1.195364857443232, + "learning_rate": 1.4040798709512777e-05, + "loss": 0.5521, + "step": 4983 + }, + { + "epoch": 0.39, + "grad_norm": 1.148793327585384, + "learning_rate": 1.4038500182175399e-05, + "loss": 0.5952, + "step": 4984 + }, + { + "epoch": 0.39, + "grad_norm": 1.2284884548366712, + "learning_rate": 1.4036201399866669e-05, + "loss": 0.5935, + "step": 4985 + }, + { + "epoch": 0.39, + "grad_norm": 1.197719817642587, + "learning_rate": 1.4033902362731719e-05, + "loss": 0.5765, + "step": 4986 + }, + { + "epoch": 0.39, + "grad_norm": 1.1187142030753594, + "learning_rate": 1.40316030709157e-05, + "loss": 0.5516, + "step": 4987 + }, + { + "epoch": 0.39, + "grad_norm": 1.1827435351162627, + "learning_rate": 1.402930352456378e-05, + "loss": 0.5638, + "step": 4988 + }, + { + "epoch": 0.39, + "grad_norm": 1.2784737537869166, + "learning_rate": 1.402700372382114e-05, + "loss": 0.5931, + "step": 4989 + }, + { + "epoch": 0.39, + "grad_norm": 1.1319007624575315, + "learning_rate": 1.4024703668832978e-05, + "loss": 0.5451, + "step": 4990 + }, + { + "epoch": 0.39, + "grad_norm": 1.2729598477662063, + "learning_rate": 1.402240335974451e-05, + "loss": 0.5577, + "step": 4991 + }, + { + "epoch": 0.39, + "grad_norm": 1.2129385381255733, + "learning_rate": 1.4020102796700962e-05, + "loss": 0.5971, + "step": 4992 + }, + { + "epoch": 0.39, + "grad_norm": 1.1671036287990175, + "learning_rate": 1.4017801979847586e-05, + "loss": 0.6157, + "step": 4993 + }, + { + "epoch": 0.39, + "grad_norm": 1.1993560712393094, + "learning_rate": 1.401550090932964e-05, + "loss": 0.5694, + "step": 4994 + }, + { + "epoch": 0.39, + "grad_norm": 1.1787565992055329, + "learning_rate": 1.4013199585292405e-05, + "loss": 0.5989, + "step": 4995 + }, + { + "epoch": 0.39, + "grad_norm": 1.1017810253070914, + "learning_rate": 1.4010898007881177e-05, + "loss": 0.5756, + "step": 4996 + }, + { + "epoch": 0.39, + "grad_norm": 1.0318563008361465, + "learning_rate": 1.400859617724126e-05, + "loss": 0.5273, + "step": 4997 + }, + { + "epoch": 0.39, + "grad_norm": 1.0754525607086431, + "learning_rate": 1.4006294093517989e-05, + "loss": 0.5224, + "step": 4998 + }, + { + "epoch": 0.39, + "grad_norm": 1.0694144720911443, + "learning_rate": 1.4003991756856699e-05, + "loss": 0.5834, + "step": 4999 + }, + { + "epoch": 0.39, + "grad_norm": 1.250856392275336, + "learning_rate": 1.4001689167402752e-05, + "loss": 0.5918, + "step": 5000 + }, + { + "epoch": 0.39, + "grad_norm": 1.1297402122936293, + "learning_rate": 1.3999386325301525e-05, + "loss": 0.6011, + "step": 5001 + }, + { + "epoch": 0.39, + "grad_norm": 1.10318884948437, + "learning_rate": 1.39970832306984e-05, + "loss": 0.5746, + "step": 5002 + }, + { + "epoch": 0.39, + "grad_norm": 1.1938638822548917, + "learning_rate": 1.3994779883738794e-05, + "loss": 0.6109, + "step": 5003 + }, + { + "epoch": 0.39, + "grad_norm": 1.1559941807834713, + "learning_rate": 1.399247628456812e-05, + "loss": 0.5714, + "step": 5004 + }, + { + "epoch": 0.39, + "grad_norm": 1.1841538114518846, + "learning_rate": 1.3990172433331819e-05, + "loss": 0.5876, + "step": 5005 + }, + { + "epoch": 0.39, + "grad_norm": 1.182314745079797, + "learning_rate": 1.398786833017535e-05, + "loss": 0.5583, + "step": 5006 + }, + { + "epoch": 0.39, + "grad_norm": 1.0280871584343054, + "learning_rate": 1.3985563975244175e-05, + "loss": 0.536, + "step": 5007 + }, + { + "epoch": 0.39, + "grad_norm": 1.1811960944738003, + "learning_rate": 1.3983259368683784e-05, + "loss": 0.5858, + "step": 5008 + }, + { + "epoch": 0.39, + "grad_norm": 1.1201477312681183, + "learning_rate": 1.3980954510639677e-05, + "loss": 0.5802, + "step": 5009 + }, + { + "epoch": 0.39, + "grad_norm": 1.16789270874303, + "learning_rate": 1.3978649401257375e-05, + "loss": 0.6132, + "step": 5010 + }, + { + "epoch": 0.39, + "grad_norm": 1.271520611177799, + "learning_rate": 1.3976344040682409e-05, + "loss": 0.6627, + "step": 5011 + }, + { + "epoch": 0.39, + "grad_norm": 1.1646887099758994, + "learning_rate": 1.3974038429060326e-05, + "loss": 0.5353, + "step": 5012 + }, + { + "epoch": 0.39, + "grad_norm": 1.0602171439755919, + "learning_rate": 1.3971732566536695e-05, + "loss": 0.526, + "step": 5013 + }, + { + "epoch": 0.39, + "grad_norm": 1.1034120224112942, + "learning_rate": 1.3969426453257096e-05, + "loss": 0.5937, + "step": 5014 + }, + { + "epoch": 0.39, + "grad_norm": 1.2315280290574246, + "learning_rate": 1.3967120089367128e-05, + "loss": 0.5864, + "step": 5015 + }, + { + "epoch": 0.39, + "grad_norm": 1.0512289915582456, + "learning_rate": 1.3964813475012398e-05, + "loss": 0.5216, + "step": 5016 + }, + { + "epoch": 0.39, + "grad_norm": 1.198551750456754, + "learning_rate": 1.3962506610338537e-05, + "loss": 0.5415, + "step": 5017 + }, + { + "epoch": 0.39, + "grad_norm": 1.160523173559806, + "learning_rate": 1.3960199495491192e-05, + "loss": 0.5936, + "step": 5018 + }, + { + "epoch": 0.39, + "grad_norm": 1.0838572934551802, + "learning_rate": 1.3957892130616018e-05, + "loss": 0.5861, + "step": 5019 + }, + { + "epoch": 0.39, + "grad_norm": 1.2284209634914356, + "learning_rate": 1.3955584515858696e-05, + "loss": 0.6109, + "step": 5020 + }, + { + "epoch": 0.39, + "grad_norm": 1.1760741721234578, + "learning_rate": 1.3953276651364918e-05, + "loss": 0.5597, + "step": 5021 + }, + { + "epoch": 0.39, + "grad_norm": 1.2884355812326782, + "learning_rate": 1.3950968537280383e-05, + "loss": 0.5995, + "step": 5022 + }, + { + "epoch": 0.39, + "grad_norm": 1.1295864603488763, + "learning_rate": 1.3948660173750823e-05, + "loss": 0.5355, + "step": 5023 + }, + { + "epoch": 0.39, + "grad_norm": 1.1063434927706548, + "learning_rate": 1.3946351560921974e-05, + "loss": 0.5495, + "step": 5024 + }, + { + "epoch": 0.39, + "grad_norm": 1.2446574001709683, + "learning_rate": 1.3944042698939591e-05, + "loss": 0.6116, + "step": 5025 + }, + { + "epoch": 0.39, + "grad_norm": 1.1797739464620633, + "learning_rate": 1.3941733587949442e-05, + "loss": 0.6027, + "step": 5026 + }, + { + "epoch": 0.39, + "grad_norm": 1.1064585645375302, + "learning_rate": 1.3939424228097314e-05, + "loss": 0.5717, + "step": 5027 + }, + { + "epoch": 0.39, + "grad_norm": 1.1813293551477768, + "learning_rate": 1.3937114619529012e-05, + "loss": 0.5832, + "step": 5028 + }, + { + "epoch": 0.39, + "grad_norm": 1.114436838142661, + "learning_rate": 1.3934804762390351e-05, + "loss": 0.5662, + "step": 5029 + }, + { + "epoch": 0.39, + "grad_norm": 1.1960966741823895, + "learning_rate": 1.3932494656827165e-05, + "loss": 0.5762, + "step": 5030 + }, + { + "epoch": 0.39, + "grad_norm": 1.1692603888891244, + "learning_rate": 1.3930184302985302e-05, + "loss": 0.5788, + "step": 5031 + }, + { + "epoch": 0.39, + "grad_norm": 1.1236667680309351, + "learning_rate": 1.3927873701010625e-05, + "loss": 0.5405, + "step": 5032 + }, + { + "epoch": 0.39, + "grad_norm": 1.2086853742237078, + "learning_rate": 1.3925562851049017e-05, + "loss": 0.6183, + "step": 5033 + }, + { + "epoch": 0.39, + "grad_norm": 1.1783311132349865, + "learning_rate": 1.3923251753246371e-05, + "loss": 0.5797, + "step": 5034 + }, + { + "epoch": 0.39, + "grad_norm": 1.2271953820892694, + "learning_rate": 1.3920940407748607e-05, + "loss": 0.6282, + "step": 5035 + }, + { + "epoch": 0.39, + "grad_norm": 1.2846407798636061, + "learning_rate": 1.3918628814701643e-05, + "loss": 0.6647, + "step": 5036 + }, + { + "epoch": 0.39, + "grad_norm": 1.1136105234513498, + "learning_rate": 1.391631697425142e-05, + "loss": 0.5859, + "step": 5037 + }, + { + "epoch": 0.39, + "grad_norm": 1.1289616871458, + "learning_rate": 1.3914004886543904e-05, + "loss": 0.586, + "step": 5038 + }, + { + "epoch": 0.39, + "grad_norm": 1.097468512828539, + "learning_rate": 1.3911692551725065e-05, + "loss": 0.4819, + "step": 5039 + }, + { + "epoch": 0.39, + "grad_norm": 1.2165046941162556, + "learning_rate": 1.3909379969940894e-05, + "loss": 0.516, + "step": 5040 + }, + { + "epoch": 0.39, + "grad_norm": 1.192142846707376, + "learning_rate": 1.3907067141337397e-05, + "loss": 0.5959, + "step": 5041 + }, + { + "epoch": 0.39, + "grad_norm": 1.0752386072296287, + "learning_rate": 1.3904754066060593e-05, + "loss": 0.5593, + "step": 5042 + }, + { + "epoch": 0.39, + "grad_norm": 1.1220058226748795, + "learning_rate": 1.3902440744256516e-05, + "loss": 0.5524, + "step": 5043 + }, + { + "epoch": 0.39, + "grad_norm": 1.0781451237224033, + "learning_rate": 1.3900127176071226e-05, + "loss": 0.5714, + "step": 5044 + }, + { + "epoch": 0.39, + "grad_norm": 1.1659100099715796, + "learning_rate": 1.3897813361650783e-05, + "loss": 0.5683, + "step": 5045 + }, + { + "epoch": 0.39, + "grad_norm": 1.1880990073598108, + "learning_rate": 1.3895499301141273e-05, + "loss": 0.5649, + "step": 5046 + }, + { + "epoch": 0.39, + "grad_norm": 1.134200935565453, + "learning_rate": 1.389318499468879e-05, + "loss": 0.59, + "step": 5047 + }, + { + "epoch": 0.39, + "grad_norm": 1.2836453910929442, + "learning_rate": 1.3890870442439455e-05, + "loss": 0.559, + "step": 5048 + }, + { + "epoch": 0.39, + "grad_norm": 1.1648478578921058, + "learning_rate": 1.3888555644539397e-05, + "loss": 0.6468, + "step": 5049 + }, + { + "epoch": 0.39, + "grad_norm": 1.1400754466940612, + "learning_rate": 1.3886240601134757e-05, + "loss": 0.5487, + "step": 5050 + }, + { + "epoch": 0.39, + "grad_norm": 1.0193843808844685, + "learning_rate": 1.3883925312371698e-05, + "loss": 0.5642, + "step": 5051 + }, + { + "epoch": 0.39, + "grad_norm": 1.0672951930207055, + "learning_rate": 1.3881609778396394e-05, + "loss": 0.5826, + "step": 5052 + }, + { + "epoch": 0.39, + "grad_norm": 1.172555192002202, + "learning_rate": 1.3879293999355037e-05, + "loss": 0.5745, + "step": 5053 + }, + { + "epoch": 0.39, + "grad_norm": 1.1489998096070826, + "learning_rate": 1.387697797539384e-05, + "loss": 0.5357, + "step": 5054 + }, + { + "epoch": 0.39, + "grad_norm": 1.2987461150242523, + "learning_rate": 1.3874661706659018e-05, + "loss": 0.5867, + "step": 5055 + }, + { + "epoch": 0.39, + "grad_norm": 1.1839521010382115, + "learning_rate": 1.3872345193296812e-05, + "loss": 0.6301, + "step": 5056 + }, + { + "epoch": 0.39, + "grad_norm": 1.0940891285074374, + "learning_rate": 1.3870028435453476e-05, + "loss": 0.5202, + "step": 5057 + }, + { + "epoch": 0.39, + "grad_norm": 1.1252981426633528, + "learning_rate": 1.3867711433275275e-05, + "loss": 0.6074, + "step": 5058 + }, + { + "epoch": 0.39, + "grad_norm": 1.1649098224693075, + "learning_rate": 1.38653941869085e-05, + "loss": 0.5616, + "step": 5059 + }, + { + "epoch": 0.39, + "grad_norm": 1.1961083349613146, + "learning_rate": 1.3863076696499447e-05, + "loss": 0.5461, + "step": 5060 + }, + { + "epoch": 0.39, + "grad_norm": 1.0949045354781428, + "learning_rate": 1.3860758962194432e-05, + "loss": 0.5873, + "step": 5061 + }, + { + "epoch": 0.39, + "grad_norm": 1.3233252918005483, + "learning_rate": 1.385844098413978e-05, + "loss": 0.605, + "step": 5062 + }, + { + "epoch": 0.39, + "grad_norm": 1.225336081276207, + "learning_rate": 1.3856122762481845e-05, + "loss": 0.5775, + "step": 5063 + }, + { + "epoch": 0.39, + "grad_norm": 1.189911251445215, + "learning_rate": 1.3853804297366986e-05, + "loss": 0.5683, + "step": 5064 + }, + { + "epoch": 0.39, + "grad_norm": 1.1688029904515929, + "learning_rate": 1.3851485588941578e-05, + "loss": 0.569, + "step": 5065 + }, + { + "epoch": 0.39, + "grad_norm": 1.1842199500863406, + "learning_rate": 1.3849166637352011e-05, + "loss": 0.5554, + "step": 5066 + }, + { + "epoch": 0.39, + "grad_norm": 1.1851311697974825, + "learning_rate": 1.38468474427447e-05, + "loss": 0.5569, + "step": 5067 + }, + { + "epoch": 0.39, + "grad_norm": 1.0676355228564274, + "learning_rate": 1.3844528005266057e-05, + "loss": 0.5444, + "step": 5068 + }, + { + "epoch": 0.39, + "grad_norm": 1.0443988953007817, + "learning_rate": 1.3842208325062532e-05, + "loss": 0.5232, + "step": 5069 + }, + { + "epoch": 0.39, + "grad_norm": 1.1501182142948272, + "learning_rate": 1.383988840228057e-05, + "loss": 0.553, + "step": 5070 + }, + { + "epoch": 0.39, + "grad_norm": 1.0773497848950297, + "learning_rate": 1.3837568237066641e-05, + "loss": 0.5317, + "step": 5071 + }, + { + "epoch": 0.39, + "grad_norm": 1.0995766887244798, + "learning_rate": 1.383524782956723e-05, + "loss": 0.5444, + "step": 5072 + }, + { + "epoch": 0.39, + "grad_norm": 1.1433419046738715, + "learning_rate": 1.3832927179928834e-05, + "loss": 0.5872, + "step": 5073 + }, + { + "epoch": 0.39, + "grad_norm": 1.2043275644238283, + "learning_rate": 1.3830606288297976e-05, + "loss": 0.5952, + "step": 5074 + }, + { + "epoch": 0.39, + "grad_norm": 1.2522080945881557, + "learning_rate": 1.3828285154821175e-05, + "loss": 0.5521, + "step": 5075 + }, + { + "epoch": 0.39, + "grad_norm": 1.1262906935746209, + "learning_rate": 1.3825963779644981e-05, + "loss": 0.575, + "step": 5076 + }, + { + "epoch": 0.39, + "grad_norm": 1.1679750780425355, + "learning_rate": 1.3823642162915958e-05, + "loss": 0.4945, + "step": 5077 + }, + { + "epoch": 0.39, + "grad_norm": 1.2046123575210388, + "learning_rate": 1.3821320304780671e-05, + "loss": 0.5832, + "step": 5078 + }, + { + "epoch": 0.39, + "grad_norm": 1.0860870038864228, + "learning_rate": 1.3818998205385726e-05, + "loss": 0.512, + "step": 5079 + }, + { + "epoch": 0.39, + "grad_norm": 1.0958168664330807, + "learning_rate": 1.3816675864877715e-05, + "loss": 0.5357, + "step": 5080 + }, + { + "epoch": 0.39, + "grad_norm": 1.14363766311371, + "learning_rate": 1.3814353283403266e-05, + "loss": 0.5657, + "step": 5081 + }, + { + "epoch": 0.39, + "grad_norm": 1.0858749261844856, + "learning_rate": 1.3812030461109017e-05, + "loss": 0.4903, + "step": 5082 + }, + { + "epoch": 0.39, + "grad_norm": 1.2557622180809869, + "learning_rate": 1.3809707398141615e-05, + "loss": 0.6093, + "step": 5083 + }, + { + "epoch": 0.39, + "grad_norm": 1.277200346663464, + "learning_rate": 1.3807384094647728e-05, + "loss": 0.6032, + "step": 5084 + }, + { + "epoch": 0.39, + "grad_norm": 1.2659834836783417, + "learning_rate": 1.3805060550774045e-05, + "loss": 0.5976, + "step": 5085 + }, + { + "epoch": 0.39, + "grad_norm": 1.1807909219771489, + "learning_rate": 1.3802736766667253e-05, + "loss": 0.5401, + "step": 5086 + }, + { + "epoch": 0.39, + "grad_norm": 1.2161290768674509, + "learning_rate": 1.3800412742474074e-05, + "loss": 0.6131, + "step": 5087 + }, + { + "epoch": 0.39, + "grad_norm": 1.2514854664615915, + "learning_rate": 1.3798088478341226e-05, + "loss": 0.6247, + "step": 5088 + }, + { + "epoch": 0.39, + "grad_norm": 1.2108117315138005, + "learning_rate": 1.3795763974415459e-05, + "loss": 0.5639, + "step": 5089 + }, + { + "epoch": 0.39, + "grad_norm": 1.171089417676693, + "learning_rate": 1.379343923084353e-05, + "loss": 0.5985, + "step": 5090 + }, + { + "epoch": 0.39, + "grad_norm": 1.0987285114569603, + "learning_rate": 1.379111424777221e-05, + "loss": 0.5674, + "step": 5091 + }, + { + "epoch": 0.4, + "grad_norm": 1.2470046394280194, + "learning_rate": 1.3788789025348288e-05, + "loss": 0.5939, + "step": 5092 + }, + { + "epoch": 0.4, + "grad_norm": 1.2564369404183413, + "learning_rate": 1.3786463563718564e-05, + "loss": 0.6035, + "step": 5093 + }, + { + "epoch": 0.4, + "grad_norm": 1.3448791638942845, + "learning_rate": 1.3784137863029864e-05, + "loss": 0.6392, + "step": 5094 + }, + { + "epoch": 0.4, + "grad_norm": 1.3296999848941775, + "learning_rate": 1.3781811923429018e-05, + "loss": 0.6535, + "step": 5095 + }, + { + "epoch": 0.4, + "grad_norm": 1.1156698201897324, + "learning_rate": 1.3779485745062869e-05, + "loss": 0.5705, + "step": 5096 + }, + { + "epoch": 0.4, + "grad_norm": 1.1670516888760163, + "learning_rate": 1.3777159328078289e-05, + "loss": 0.621, + "step": 5097 + }, + { + "epoch": 0.4, + "grad_norm": 1.1223376879341824, + "learning_rate": 1.377483267262215e-05, + "loss": 0.5483, + "step": 5098 + }, + { + "epoch": 0.4, + "grad_norm": 1.0578393817230047, + "learning_rate": 1.3772505778841351e-05, + "loss": 0.532, + "step": 5099 + }, + { + "epoch": 0.4, + "grad_norm": 1.1010552719752902, + "learning_rate": 1.37701786468828e-05, + "loss": 0.5705, + "step": 5100 + }, + { + "epoch": 0.4, + "grad_norm": 1.2344446283331352, + "learning_rate": 1.3767851276893415e-05, + "loss": 0.5687, + "step": 5101 + }, + { + "epoch": 0.4, + "grad_norm": 1.1179940573375722, + "learning_rate": 1.3765523669020144e-05, + "loss": 0.528, + "step": 5102 + }, + { + "epoch": 0.4, + "grad_norm": 1.2326976147245252, + "learning_rate": 1.3763195823409932e-05, + "loss": 0.5715, + "step": 5103 + }, + { + "epoch": 0.4, + "grad_norm": 1.1652079841062606, + "learning_rate": 1.3760867740209755e-05, + "loss": 0.5853, + "step": 5104 + }, + { + "epoch": 0.4, + "grad_norm": 1.096963303970221, + "learning_rate": 1.3758539419566595e-05, + "loss": 0.5749, + "step": 5105 + }, + { + "epoch": 0.4, + "grad_norm": 1.0746071408136593, + "learning_rate": 1.3756210861627446e-05, + "loss": 0.5525, + "step": 5106 + }, + { + "epoch": 0.4, + "grad_norm": 1.0965460324729155, + "learning_rate": 1.3753882066539328e-05, + "loss": 0.5335, + "step": 5107 + }, + { + "epoch": 0.4, + "grad_norm": 1.1645182287116982, + "learning_rate": 1.3751553034449267e-05, + "loss": 0.6002, + "step": 5108 + }, + { + "epoch": 0.4, + "grad_norm": 1.157325038663377, + "learning_rate": 1.374922376550431e-05, + "loss": 0.5593, + "step": 5109 + }, + { + "epoch": 0.4, + "grad_norm": 1.1870787023791176, + "learning_rate": 1.3746894259851513e-05, + "loss": 0.5456, + "step": 5110 + }, + { + "epoch": 0.4, + "grad_norm": 1.302207233891866, + "learning_rate": 1.3744564517637947e-05, + "loss": 0.5702, + "step": 5111 + }, + { + "epoch": 0.4, + "grad_norm": 1.1542231863805963, + "learning_rate": 1.3742234539010703e-05, + "loss": 0.5469, + "step": 5112 + }, + { + "epoch": 0.4, + "grad_norm": 1.1879417701957935, + "learning_rate": 1.3739904324116888e-05, + "loss": 0.5793, + "step": 5113 + }, + { + "epoch": 0.4, + "grad_norm": 1.1136928935026065, + "learning_rate": 1.3737573873103615e-05, + "loss": 0.5757, + "step": 5114 + }, + { + "epoch": 0.4, + "grad_norm": 1.2274395182328943, + "learning_rate": 1.3735243186118026e-05, + "loss": 0.5635, + "step": 5115 + }, + { + "epoch": 0.4, + "grad_norm": 1.1150440915546074, + "learning_rate": 1.3732912263307256e-05, + "loss": 0.5287, + "step": 5116 + }, + { + "epoch": 0.4, + "grad_norm": 1.225364099616156, + "learning_rate": 1.3730581104818477e-05, + "loss": 0.5333, + "step": 5117 + }, + { + "epoch": 0.4, + "grad_norm": 1.1317513066430256, + "learning_rate": 1.372824971079887e-05, + "loss": 0.5736, + "step": 5118 + }, + { + "epoch": 0.4, + "grad_norm": 1.209555988420983, + "learning_rate": 1.3725918081395619e-05, + "loss": 0.5854, + "step": 5119 + }, + { + "epoch": 0.4, + "grad_norm": 1.1674513675599836, + "learning_rate": 1.372358621675594e-05, + "loss": 0.5563, + "step": 5120 + }, + { + "epoch": 0.4, + "grad_norm": 1.223065390091859, + "learning_rate": 1.3721254117027049e-05, + "loss": 0.6077, + "step": 5121 + }, + { + "epoch": 0.4, + "grad_norm": 1.1131230827542071, + "learning_rate": 1.3718921782356187e-05, + "loss": 0.5505, + "step": 5122 + }, + { + "epoch": 0.4, + "grad_norm": 1.270391835718321, + "learning_rate": 1.3716589212890608e-05, + "loss": 0.6009, + "step": 5123 + }, + { + "epoch": 0.4, + "grad_norm": 1.2485479026743516, + "learning_rate": 1.3714256408777575e-05, + "loss": 0.6012, + "step": 5124 + }, + { + "epoch": 0.4, + "grad_norm": 1.208180445283391, + "learning_rate": 1.3711923370164373e-05, + "loss": 0.6001, + "step": 5125 + }, + { + "epoch": 0.4, + "grad_norm": 1.2076025046206034, + "learning_rate": 1.3709590097198295e-05, + "loss": 0.5781, + "step": 5126 + }, + { + "epoch": 0.4, + "grad_norm": 1.0729987412451536, + "learning_rate": 1.3707256590026656e-05, + "loss": 0.517, + "step": 5127 + }, + { + "epoch": 0.4, + "grad_norm": 1.181185144329341, + "learning_rate": 1.3704922848796782e-05, + "loss": 0.5925, + "step": 5128 + }, + { + "epoch": 0.4, + "grad_norm": 1.14652955260288, + "learning_rate": 1.3702588873656015e-05, + "loss": 0.5673, + "step": 5129 + }, + { + "epoch": 0.4, + "grad_norm": 1.2077652264383187, + "learning_rate": 1.370025466475171e-05, + "loss": 0.6318, + "step": 5130 + }, + { + "epoch": 0.4, + "grad_norm": 1.2391190447418405, + "learning_rate": 1.3697920222231233e-05, + "loss": 0.5819, + "step": 5131 + }, + { + "epoch": 0.4, + "grad_norm": 1.0633947868187816, + "learning_rate": 1.3695585546241977e-05, + "loss": 0.5433, + "step": 5132 + }, + { + "epoch": 0.4, + "grad_norm": 1.1979733937418402, + "learning_rate": 1.3693250636931339e-05, + "loss": 0.571, + "step": 5133 + }, + { + "epoch": 0.4, + "grad_norm": 1.278536318603655, + "learning_rate": 1.3690915494446732e-05, + "loss": 0.5792, + "step": 5134 + }, + { + "epoch": 0.4, + "grad_norm": 1.2121630957620921, + "learning_rate": 1.3688580118935591e-05, + "loss": 0.5875, + "step": 5135 + }, + { + "epoch": 0.4, + "grad_norm": 1.1831097510272532, + "learning_rate": 1.3686244510545353e-05, + "loss": 0.5541, + "step": 5136 + }, + { + "epoch": 0.4, + "grad_norm": 1.1002999503653201, + "learning_rate": 1.3683908669423479e-05, + "loss": 0.5572, + "step": 5137 + }, + { + "epoch": 0.4, + "grad_norm": 1.1298810188984791, + "learning_rate": 1.3681572595717446e-05, + "loss": 0.584, + "step": 5138 + }, + { + "epoch": 0.4, + "grad_norm": 1.0699068400568583, + "learning_rate": 1.3679236289574742e-05, + "loss": 0.5415, + "step": 5139 + }, + { + "epoch": 0.4, + "grad_norm": 1.1055363610825557, + "learning_rate": 1.3676899751142866e-05, + "loss": 0.5655, + "step": 5140 + }, + { + "epoch": 0.4, + "grad_norm": 1.1047699677630374, + "learning_rate": 1.3674562980569342e-05, + "loss": 0.6065, + "step": 5141 + }, + { + "epoch": 0.4, + "grad_norm": 1.1178692424855816, + "learning_rate": 1.3672225978001694e-05, + "loss": 0.5032, + "step": 5142 + }, + { + "epoch": 0.4, + "grad_norm": 1.1723398939514398, + "learning_rate": 1.3669888743587478e-05, + "loss": 0.6144, + "step": 5143 + }, + { + "epoch": 0.4, + "grad_norm": 1.1890604407634775, + "learning_rate": 1.3667551277474252e-05, + "loss": 0.5509, + "step": 5144 + }, + { + "epoch": 0.4, + "grad_norm": 1.2264492687272461, + "learning_rate": 1.3665213579809588e-05, + "loss": 0.6048, + "step": 5145 + }, + { + "epoch": 0.4, + "grad_norm": 1.1425296088457548, + "learning_rate": 1.3662875650741082e-05, + "loss": 0.558, + "step": 5146 + }, + { + "epoch": 0.4, + "grad_norm": 1.1903742588773563, + "learning_rate": 1.3660537490416337e-05, + "loss": 0.6399, + "step": 5147 + }, + { + "epoch": 0.4, + "grad_norm": 1.059610364306384, + "learning_rate": 1.3658199098982979e-05, + "loss": 0.5719, + "step": 5148 + }, + { + "epoch": 0.4, + "grad_norm": 1.3932991034046338, + "learning_rate": 1.3655860476588636e-05, + "loss": 0.6588, + "step": 5149 + }, + { + "epoch": 0.4, + "grad_norm": 1.3501578733180393, + "learning_rate": 1.3653521623380956e-05, + "loss": 0.6182, + "step": 5150 + }, + { + "epoch": 0.4, + "grad_norm": 1.133479691386181, + "learning_rate": 1.365118253950761e-05, + "loss": 0.5656, + "step": 5151 + }, + { + "epoch": 0.4, + "grad_norm": 1.1529509932565465, + "learning_rate": 1.364884322511627e-05, + "loss": 0.5785, + "step": 5152 + }, + { + "epoch": 0.4, + "grad_norm": 1.1952192980763319, + "learning_rate": 1.3646503680354633e-05, + "loss": 0.5223, + "step": 5153 + }, + { + "epoch": 0.4, + "grad_norm": 1.0270377163819115, + "learning_rate": 1.3644163905370406e-05, + "loss": 0.5162, + "step": 5154 + }, + { + "epoch": 0.4, + "grad_norm": 1.081539276323409, + "learning_rate": 1.3641823900311308e-05, + "loss": 0.4914, + "step": 5155 + }, + { + "epoch": 0.4, + "grad_norm": 1.2481224741597374, + "learning_rate": 1.3639483665325082e-05, + "loss": 0.5705, + "step": 5156 + }, + { + "epoch": 0.4, + "grad_norm": 1.239927671461129, + "learning_rate": 1.363714320055947e-05, + "loss": 0.6092, + "step": 5157 + }, + { + "epoch": 0.4, + "grad_norm": 1.1684606020434887, + "learning_rate": 1.3634802506162246e-05, + "loss": 0.5975, + "step": 5158 + }, + { + "epoch": 0.4, + "grad_norm": 1.159317484043346, + "learning_rate": 1.3632461582281184e-05, + "loss": 0.536, + "step": 5159 + }, + { + "epoch": 0.4, + "grad_norm": 1.2963703966280629, + "learning_rate": 1.3630120429064082e-05, + "loss": 0.6333, + "step": 5160 + }, + { + "epoch": 0.4, + "grad_norm": 1.1223450698642972, + "learning_rate": 1.3627779046658752e-05, + "loss": 0.5367, + "step": 5161 + }, + { + "epoch": 0.4, + "grad_norm": 1.152854780257862, + "learning_rate": 1.3625437435213009e-05, + "loss": 0.5699, + "step": 5162 + }, + { + "epoch": 0.4, + "grad_norm": 1.1573887964800986, + "learning_rate": 1.3623095594874698e-05, + "loss": 0.5517, + "step": 5163 + }, + { + "epoch": 0.4, + "grad_norm": 1.1705242255766188, + "learning_rate": 1.362075352579167e-05, + "loss": 0.5387, + "step": 5164 + }, + { + "epoch": 0.4, + "grad_norm": 1.2282702954566522, + "learning_rate": 1.361841122811179e-05, + "loss": 0.5727, + "step": 5165 + }, + { + "epoch": 0.4, + "grad_norm": 1.1651345251451357, + "learning_rate": 1.361606870198294e-05, + "loss": 0.6193, + "step": 5166 + }, + { + "epoch": 0.4, + "grad_norm": 1.2741724245595392, + "learning_rate": 1.3613725947553018e-05, + "loss": 0.5958, + "step": 5167 + }, + { + "epoch": 0.4, + "grad_norm": 1.1994403547378543, + "learning_rate": 1.3611382964969933e-05, + "loss": 0.6221, + "step": 5168 + }, + { + "epoch": 0.4, + "grad_norm": 1.2577098543921341, + "learning_rate": 1.360903975438161e-05, + "loss": 0.5529, + "step": 5169 + }, + { + "epoch": 0.4, + "grad_norm": 1.2031053318546336, + "learning_rate": 1.3606696315935983e-05, + "loss": 0.5918, + "step": 5170 + }, + { + "epoch": 0.4, + "grad_norm": 1.2037690655712658, + "learning_rate": 1.3604352649781012e-05, + "loss": 0.5967, + "step": 5171 + }, + { + "epoch": 0.4, + "grad_norm": 1.1690285253903214, + "learning_rate": 1.3602008756064661e-05, + "loss": 0.6055, + "step": 5172 + }, + { + "epoch": 0.4, + "grad_norm": 1.1573498108979243, + "learning_rate": 1.3599664634934918e-05, + "loss": 0.566, + "step": 5173 + }, + { + "epoch": 0.4, + "grad_norm": 1.1630579050627143, + "learning_rate": 1.359732028653977e-05, + "loss": 0.5949, + "step": 5174 + }, + { + "epoch": 0.4, + "grad_norm": 1.1540691841814061, + "learning_rate": 1.3594975711027231e-05, + "loss": 0.596, + "step": 5175 + }, + { + "epoch": 0.4, + "grad_norm": 1.1927771513740384, + "learning_rate": 1.3592630908545332e-05, + "loss": 0.5648, + "step": 5176 + }, + { + "epoch": 0.4, + "grad_norm": 1.0961138113718683, + "learning_rate": 1.3590285879242107e-05, + "loss": 0.5902, + "step": 5177 + }, + { + "epoch": 0.4, + "grad_norm": 1.1605104361725986, + "learning_rate": 1.358794062326561e-05, + "loss": 0.5967, + "step": 5178 + }, + { + "epoch": 0.4, + "grad_norm": 1.0735656972680843, + "learning_rate": 1.3585595140763912e-05, + "loss": 0.5982, + "step": 5179 + }, + { + "epoch": 0.4, + "grad_norm": 1.1756079156294366, + "learning_rate": 1.3583249431885091e-05, + "loss": 0.6237, + "step": 5180 + }, + { + "epoch": 0.4, + "grad_norm": 1.137341253500891, + "learning_rate": 1.3580903496777248e-05, + "loss": 0.5724, + "step": 5181 + }, + { + "epoch": 0.4, + "grad_norm": 1.2936730440259925, + "learning_rate": 1.3578557335588492e-05, + "loss": 0.6853, + "step": 5182 + }, + { + "epoch": 0.4, + "grad_norm": 1.160855528862785, + "learning_rate": 1.3576210948466948e-05, + "loss": 0.5354, + "step": 5183 + }, + { + "epoch": 0.4, + "grad_norm": 1.1667562972561536, + "learning_rate": 1.3573864335560756e-05, + "loss": 0.5645, + "step": 5184 + }, + { + "epoch": 0.4, + "grad_norm": 1.1011772293658901, + "learning_rate": 1.357151749701807e-05, + "loss": 0.5405, + "step": 5185 + }, + { + "epoch": 0.4, + "grad_norm": 1.1593123426754495, + "learning_rate": 1.3569170432987058e-05, + "loss": 0.5793, + "step": 5186 + }, + { + "epoch": 0.4, + "grad_norm": 1.141263025633567, + "learning_rate": 1.35668231436159e-05, + "loss": 0.5594, + "step": 5187 + }, + { + "epoch": 0.4, + "grad_norm": 1.1514834020977904, + "learning_rate": 1.3564475629052798e-05, + "loss": 0.5516, + "step": 5188 + }, + { + "epoch": 0.4, + "grad_norm": 1.1484035045756322, + "learning_rate": 1.3562127889445958e-05, + "loss": 0.6041, + "step": 5189 + }, + { + "epoch": 0.4, + "grad_norm": 1.286528947053089, + "learning_rate": 1.3559779924943603e-05, + "loss": 0.6325, + "step": 5190 + }, + { + "epoch": 0.4, + "grad_norm": 1.3521997790553795, + "learning_rate": 1.355743173569398e-05, + "loss": 0.6333, + "step": 5191 + }, + { + "epoch": 0.4, + "grad_norm": 1.2142113514577728, + "learning_rate": 1.3555083321845335e-05, + "loss": 0.5448, + "step": 5192 + }, + { + "epoch": 0.4, + "grad_norm": 1.1258377029273912, + "learning_rate": 1.355273468354594e-05, + "loss": 0.5584, + "step": 5193 + }, + { + "epoch": 0.4, + "grad_norm": 1.1216301781064957, + "learning_rate": 1.3550385820944074e-05, + "loss": 0.5458, + "step": 5194 + }, + { + "epoch": 0.4, + "grad_norm": 1.2637951184980942, + "learning_rate": 1.3548036734188033e-05, + "loss": 0.587, + "step": 5195 + }, + { + "epoch": 0.4, + "grad_norm": 1.2355631650851207, + "learning_rate": 1.3545687423426128e-05, + "loss": 0.6126, + "step": 5196 + }, + { + "epoch": 0.4, + "grad_norm": 1.085701951202441, + "learning_rate": 1.354333788880668e-05, + "loss": 0.4982, + "step": 5197 + }, + { + "epoch": 0.4, + "grad_norm": 1.1498156731535774, + "learning_rate": 1.3540988130478034e-05, + "loss": 0.5754, + "step": 5198 + }, + { + "epoch": 0.4, + "grad_norm": 1.1721203356472376, + "learning_rate": 1.3538638148588537e-05, + "loss": 0.5825, + "step": 5199 + }, + { + "epoch": 0.4, + "grad_norm": 1.2816878710325035, + "learning_rate": 1.3536287943286555e-05, + "loss": 0.5741, + "step": 5200 + }, + { + "epoch": 0.4, + "grad_norm": 1.2869727564360933, + "learning_rate": 1.353393751472047e-05, + "loss": 0.6197, + "step": 5201 + }, + { + "epoch": 0.4, + "grad_norm": 1.1260877754494374, + "learning_rate": 1.3531586863038678e-05, + "loss": 0.5892, + "step": 5202 + }, + { + "epoch": 0.4, + "grad_norm": 1.2899028004392399, + "learning_rate": 1.3529235988389588e-05, + "loss": 0.5654, + "step": 5203 + }, + { + "epoch": 0.4, + "grad_norm": 1.1726781000570679, + "learning_rate": 1.3526884890921621e-05, + "loss": 0.6146, + "step": 5204 + }, + { + "epoch": 0.4, + "grad_norm": 1.2676549103554768, + "learning_rate": 1.3524533570783214e-05, + "loss": 0.6235, + "step": 5205 + }, + { + "epoch": 0.4, + "grad_norm": 1.1817948674189769, + "learning_rate": 1.3522182028122817e-05, + "loss": 0.5903, + "step": 5206 + }, + { + "epoch": 0.4, + "grad_norm": 1.1638322128774148, + "learning_rate": 1.3519830263088899e-05, + "loss": 0.555, + "step": 5207 + }, + { + "epoch": 0.4, + "grad_norm": 1.7848086362996562, + "learning_rate": 1.3517478275829935e-05, + "loss": 0.5924, + "step": 5208 + }, + { + "epoch": 0.4, + "grad_norm": 1.1323558018315305, + "learning_rate": 1.3515126066494422e-05, + "loss": 0.5676, + "step": 5209 + }, + { + "epoch": 0.4, + "grad_norm": 1.297026889592874, + "learning_rate": 1.351277363523086e-05, + "loss": 0.5744, + "step": 5210 + }, + { + "epoch": 0.4, + "grad_norm": 1.2595048974681344, + "learning_rate": 1.3510420982187777e-05, + "loss": 0.6017, + "step": 5211 + }, + { + "epoch": 0.4, + "grad_norm": 1.185826527827758, + "learning_rate": 1.350806810751371e-05, + "loss": 0.5712, + "step": 5212 + }, + { + "epoch": 0.4, + "grad_norm": 1.202157957161739, + "learning_rate": 1.3505715011357202e-05, + "loss": 0.6262, + "step": 5213 + }, + { + "epoch": 0.4, + "grad_norm": 1.173455253700427, + "learning_rate": 1.3503361693866817e-05, + "loss": 0.5991, + "step": 5214 + }, + { + "epoch": 0.4, + "grad_norm": 1.1796111025293248, + "learning_rate": 1.350100815519113e-05, + "loss": 0.562, + "step": 5215 + }, + { + "epoch": 0.4, + "grad_norm": 1.2338232907959152, + "learning_rate": 1.349865439547874e-05, + "loss": 0.6106, + "step": 5216 + }, + { + "epoch": 0.4, + "grad_norm": 1.1161664551743316, + "learning_rate": 1.3496300414878247e-05, + "loss": 0.5736, + "step": 5217 + }, + { + "epoch": 0.4, + "grad_norm": 1.2048698752332165, + "learning_rate": 1.3493946213538269e-05, + "loss": 0.5271, + "step": 5218 + }, + { + "epoch": 0.4, + "grad_norm": 1.165472981048526, + "learning_rate": 1.3491591791607443e-05, + "loss": 0.5496, + "step": 5219 + }, + { + "epoch": 0.4, + "grad_norm": 1.1799031243725278, + "learning_rate": 1.3489237149234409e-05, + "loss": 0.5648, + "step": 5220 + }, + { + "epoch": 0.41, + "grad_norm": 1.318597435826858, + "learning_rate": 1.3486882286567831e-05, + "loss": 0.563, + "step": 5221 + }, + { + "epoch": 0.41, + "grad_norm": 1.2489523789128933, + "learning_rate": 1.3484527203756388e-05, + "loss": 0.5671, + "step": 5222 + }, + { + "epoch": 0.41, + "grad_norm": 1.2314839368385135, + "learning_rate": 1.3482171900948765e-05, + "loss": 0.5908, + "step": 5223 + }, + { + "epoch": 0.41, + "grad_norm": 1.2438119786601036, + "learning_rate": 1.3479816378293662e-05, + "loss": 0.6246, + "step": 5224 + }, + { + "epoch": 0.41, + "grad_norm": 1.0880687300732523, + "learning_rate": 1.3477460635939799e-05, + "loss": 0.5579, + "step": 5225 + }, + { + "epoch": 0.41, + "grad_norm": 1.138819323459468, + "learning_rate": 1.3475104674035902e-05, + "loss": 0.5878, + "step": 5226 + }, + { + "epoch": 0.41, + "grad_norm": 1.2050894515681492, + "learning_rate": 1.347274849273072e-05, + "loss": 0.5829, + "step": 5227 + }, + { + "epoch": 0.41, + "grad_norm": 1.2568860166304936, + "learning_rate": 1.3470392092173008e-05, + "loss": 0.6067, + "step": 5228 + }, + { + "epoch": 0.41, + "grad_norm": 1.251285178407395, + "learning_rate": 1.3468035472511539e-05, + "loss": 0.6112, + "step": 5229 + }, + { + "epoch": 0.41, + "grad_norm": 1.165477021261257, + "learning_rate": 1.3465678633895096e-05, + "loss": 0.5608, + "step": 5230 + }, + { + "epoch": 0.41, + "grad_norm": 1.229551427503285, + "learning_rate": 1.3463321576472482e-05, + "loss": 0.5796, + "step": 5231 + }, + { + "epoch": 0.41, + "grad_norm": 1.1709672082044502, + "learning_rate": 1.346096430039251e-05, + "loss": 0.5179, + "step": 5232 + }, + { + "epoch": 0.41, + "grad_norm": 1.1369282113279262, + "learning_rate": 1.3458606805804005e-05, + "loss": 0.5679, + "step": 5233 + }, + { + "epoch": 0.41, + "grad_norm": 1.1793896128482513, + "learning_rate": 1.3456249092855805e-05, + "loss": 0.6115, + "step": 5234 + }, + { + "epoch": 0.41, + "grad_norm": 1.2023482354212447, + "learning_rate": 1.345389116169677e-05, + "loss": 0.6598, + "step": 5235 + }, + { + "epoch": 0.41, + "grad_norm": 1.2172925499769187, + "learning_rate": 1.3451533012475765e-05, + "loss": 0.5765, + "step": 5236 + }, + { + "epoch": 0.41, + "grad_norm": 1.149692757654883, + "learning_rate": 1.3449174645341675e-05, + "loss": 0.5298, + "step": 5237 + }, + { + "epoch": 0.41, + "grad_norm": 1.1926994434452942, + "learning_rate": 1.3446816060443395e-05, + "loss": 0.5698, + "step": 5238 + }, + { + "epoch": 0.41, + "grad_norm": 1.0959700261040861, + "learning_rate": 1.344445725792983e-05, + "loss": 0.5416, + "step": 5239 + }, + { + "epoch": 0.41, + "grad_norm": 1.2075576868750588, + "learning_rate": 1.3442098237949913e-05, + "loss": 0.5785, + "step": 5240 + }, + { + "epoch": 0.41, + "grad_norm": 1.178225033912204, + "learning_rate": 1.3439739000652569e-05, + "loss": 0.5528, + "step": 5241 + }, + { + "epoch": 0.41, + "grad_norm": 1.169026842835405, + "learning_rate": 1.343737954618676e-05, + "loss": 0.6329, + "step": 5242 + }, + { + "epoch": 0.41, + "grad_norm": 1.1740442670473006, + "learning_rate": 1.3435019874701444e-05, + "loss": 0.5754, + "step": 5243 + }, + { + "epoch": 0.41, + "grad_norm": 1.1784416339875603, + "learning_rate": 1.3432659986345602e-05, + "loss": 0.5779, + "step": 5244 + }, + { + "epoch": 0.41, + "grad_norm": 1.2582109187541184, + "learning_rate": 1.3430299881268223e-05, + "loss": 0.6197, + "step": 5245 + }, + { + "epoch": 0.41, + "grad_norm": 1.0791516046442569, + "learning_rate": 1.3427939559618314e-05, + "loss": 0.5075, + "step": 5246 + }, + { + "epoch": 0.41, + "grad_norm": 1.141248349776647, + "learning_rate": 1.3425579021544896e-05, + "loss": 0.5397, + "step": 5247 + }, + { + "epoch": 0.41, + "grad_norm": 1.1102728635464842, + "learning_rate": 1.3423218267197005e-05, + "loss": 0.5453, + "step": 5248 + }, + { + "epoch": 0.41, + "grad_norm": 1.2214566521934216, + "learning_rate": 1.3420857296723674e-05, + "loss": 0.5963, + "step": 5249 + }, + { + "epoch": 0.41, + "grad_norm": 1.2928796835472165, + "learning_rate": 1.341849611027398e-05, + "loss": 0.6264, + "step": 5250 + }, + { + "epoch": 0.41, + "grad_norm": 1.1855863410627516, + "learning_rate": 1.3416134707996987e-05, + "loss": 0.5444, + "step": 5251 + }, + { + "epoch": 0.41, + "grad_norm": 1.1259374421495798, + "learning_rate": 1.3413773090041786e-05, + "loss": 0.5365, + "step": 5252 + }, + { + "epoch": 0.41, + "grad_norm": 1.2129149503618208, + "learning_rate": 1.3411411256557476e-05, + "loss": 0.5945, + "step": 5253 + }, + { + "epoch": 0.41, + "grad_norm": 1.1577300831319854, + "learning_rate": 1.3409049207693172e-05, + "loss": 0.5292, + "step": 5254 + }, + { + "epoch": 0.41, + "grad_norm": 1.033072335342668, + "learning_rate": 1.3406686943598003e-05, + "loss": 0.527, + "step": 5255 + }, + { + "epoch": 0.41, + "grad_norm": 1.2454299833050129, + "learning_rate": 1.3404324464421113e-05, + "loss": 0.6421, + "step": 5256 + }, + { + "epoch": 0.41, + "grad_norm": 1.0841114416480866, + "learning_rate": 1.3401961770311655e-05, + "loss": 0.5374, + "step": 5257 + }, + { + "epoch": 0.41, + "grad_norm": 1.1010238196094357, + "learning_rate": 1.3399598861418798e-05, + "loss": 0.5386, + "step": 5258 + }, + { + "epoch": 0.41, + "grad_norm": 1.2353411888923391, + "learning_rate": 1.3397235737891726e-05, + "loss": 0.6201, + "step": 5259 + }, + { + "epoch": 0.41, + "grad_norm": 1.173044816103312, + "learning_rate": 1.3394872399879634e-05, + "loss": 0.5306, + "step": 5260 + }, + { + "epoch": 0.41, + "grad_norm": 1.3148521055884241, + "learning_rate": 1.339250884753173e-05, + "loss": 0.5534, + "step": 5261 + }, + { + "epoch": 0.41, + "grad_norm": 1.1990808165311708, + "learning_rate": 1.3390145080997243e-05, + "loss": 0.5092, + "step": 5262 + }, + { + "epoch": 0.41, + "grad_norm": 1.111259096277895, + "learning_rate": 1.3387781100425407e-05, + "loss": 0.577, + "step": 5263 + }, + { + "epoch": 0.41, + "grad_norm": 1.1874351483756493, + "learning_rate": 1.3385416905965469e-05, + "loss": 0.5754, + "step": 5264 + }, + { + "epoch": 0.41, + "grad_norm": 1.1739894356227907, + "learning_rate": 1.3383052497766695e-05, + "loss": 0.5689, + "step": 5265 + }, + { + "epoch": 0.41, + "grad_norm": 1.182610131391522, + "learning_rate": 1.3380687875978362e-05, + "loss": 0.5888, + "step": 5266 + }, + { + "epoch": 0.41, + "grad_norm": 1.1114020297880585, + "learning_rate": 1.3378323040749764e-05, + "loss": 0.5546, + "step": 5267 + }, + { + "epoch": 0.41, + "grad_norm": 1.3920953242723582, + "learning_rate": 1.3375957992230201e-05, + "loss": 0.604, + "step": 5268 + }, + { + "epoch": 0.41, + "grad_norm": 1.099141375347361, + "learning_rate": 1.3373592730568992e-05, + "loss": 0.5828, + "step": 5269 + }, + { + "epoch": 0.41, + "grad_norm": 1.1208625049883711, + "learning_rate": 1.337122725591547e-05, + "loss": 0.5746, + "step": 5270 + }, + { + "epoch": 0.41, + "grad_norm": 1.187791487405812, + "learning_rate": 1.3368861568418974e-05, + "loss": 0.6015, + "step": 5271 + }, + { + "epoch": 0.41, + "grad_norm": 1.2439445209305997, + "learning_rate": 1.3366495668228869e-05, + "loss": 0.5979, + "step": 5272 + }, + { + "epoch": 0.41, + "grad_norm": 1.2590533468141456, + "learning_rate": 1.3364129555494524e-05, + "loss": 0.5534, + "step": 5273 + }, + { + "epoch": 0.41, + "grad_norm": 1.2511264017436263, + "learning_rate": 1.3361763230365323e-05, + "loss": 0.6046, + "step": 5274 + }, + { + "epoch": 0.41, + "grad_norm": 1.0830822188991507, + "learning_rate": 1.3359396692990667e-05, + "loss": 0.5522, + "step": 5275 + }, + { + "epoch": 0.41, + "grad_norm": 1.2390720958747032, + "learning_rate": 1.3357029943519961e-05, + "loss": 0.6104, + "step": 5276 + }, + { + "epoch": 0.41, + "grad_norm": 1.150432592380982, + "learning_rate": 1.335466298210264e-05, + "loss": 0.5587, + "step": 5277 + }, + { + "epoch": 0.41, + "grad_norm": 1.2277011809806118, + "learning_rate": 1.3352295808888134e-05, + "loss": 0.5624, + "step": 5278 + }, + { + "epoch": 0.41, + "grad_norm": 1.1422140470621214, + "learning_rate": 1.3349928424025897e-05, + "loss": 0.5466, + "step": 5279 + }, + { + "epoch": 0.41, + "grad_norm": 1.09349253212638, + "learning_rate": 1.3347560827665398e-05, + "loss": 0.5071, + "step": 5280 + }, + { + "epoch": 0.41, + "grad_norm": 1.223498216001818, + "learning_rate": 1.3345193019956111e-05, + "loss": 0.6364, + "step": 5281 + }, + { + "epoch": 0.41, + "grad_norm": 1.289006966059338, + "learning_rate": 1.334282500104753e-05, + "loss": 0.5651, + "step": 5282 + }, + { + "epoch": 0.41, + "grad_norm": 1.18752660219612, + "learning_rate": 1.334045677108916e-05, + "loss": 0.6393, + "step": 5283 + }, + { + "epoch": 0.41, + "grad_norm": 1.158542525431692, + "learning_rate": 1.3338088330230518e-05, + "loss": 0.5726, + "step": 5284 + }, + { + "epoch": 0.41, + "grad_norm": 1.197256464713122, + "learning_rate": 1.3335719678621139e-05, + "loss": 0.5801, + "step": 5285 + }, + { + "epoch": 0.41, + "grad_norm": 1.1925979907483184, + "learning_rate": 1.3333350816410568e-05, + "loss": 0.5056, + "step": 5286 + }, + { + "epoch": 0.41, + "grad_norm": 1.1867715207308893, + "learning_rate": 1.3330981743748362e-05, + "loss": 0.5801, + "step": 5287 + }, + { + "epoch": 0.41, + "grad_norm": 1.229042995287446, + "learning_rate": 1.3328612460784091e-05, + "loss": 0.5919, + "step": 5288 + }, + { + "epoch": 0.41, + "grad_norm": 1.2551172890891202, + "learning_rate": 1.3326242967667342e-05, + "loss": 0.5564, + "step": 5289 + }, + { + "epoch": 0.41, + "grad_norm": 1.1319927011564592, + "learning_rate": 1.3323873264547713e-05, + "loss": 0.5559, + "step": 5290 + }, + { + "epoch": 0.41, + "grad_norm": 1.097894554470465, + "learning_rate": 1.3321503351574819e-05, + "loss": 0.5697, + "step": 5291 + }, + { + "epoch": 0.41, + "grad_norm": 1.251819954631812, + "learning_rate": 1.331913322889828e-05, + "loss": 0.5668, + "step": 5292 + }, + { + "epoch": 0.41, + "grad_norm": 1.1211037452597765, + "learning_rate": 1.3316762896667737e-05, + "loss": 0.5977, + "step": 5293 + }, + { + "epoch": 0.41, + "grad_norm": 1.1597795484971984, + "learning_rate": 1.3314392355032837e-05, + "loss": 0.6342, + "step": 5294 + }, + { + "epoch": 0.41, + "grad_norm": 1.2404239059166946, + "learning_rate": 1.3312021604143247e-05, + "loss": 0.5782, + "step": 5295 + }, + { + "epoch": 0.41, + "grad_norm": 1.1565144210754121, + "learning_rate": 1.3309650644148648e-05, + "loss": 0.5637, + "step": 5296 + }, + { + "epoch": 0.41, + "grad_norm": 1.1266292324750615, + "learning_rate": 1.3307279475198729e-05, + "loss": 0.5848, + "step": 5297 + }, + { + "epoch": 0.41, + "grad_norm": 1.2360664560822057, + "learning_rate": 1.3304908097443194e-05, + "loss": 0.5793, + "step": 5298 + }, + { + "epoch": 0.41, + "grad_norm": 1.162907738090381, + "learning_rate": 1.3302536511031755e-05, + "loss": 0.5645, + "step": 5299 + }, + { + "epoch": 0.41, + "grad_norm": 1.1384226295574347, + "learning_rate": 1.3300164716114151e-05, + "loss": 0.5372, + "step": 5300 + }, + { + "epoch": 0.41, + "grad_norm": 1.1184305065157014, + "learning_rate": 1.3297792712840121e-05, + "loss": 0.5599, + "step": 5301 + }, + { + "epoch": 0.41, + "grad_norm": 1.0984647223842157, + "learning_rate": 1.3295420501359424e-05, + "loss": 0.5585, + "step": 5302 + }, + { + "epoch": 0.41, + "grad_norm": 1.303668966582131, + "learning_rate": 1.3293048081821827e-05, + "loss": 0.5829, + "step": 5303 + }, + { + "epoch": 0.41, + "grad_norm": 1.3968011243431138, + "learning_rate": 1.3290675454377114e-05, + "loss": 0.6195, + "step": 5304 + }, + { + "epoch": 0.41, + "grad_norm": 1.2358054069987254, + "learning_rate": 1.3288302619175082e-05, + "loss": 0.562, + "step": 5305 + }, + { + "epoch": 0.41, + "grad_norm": 1.1645383950140562, + "learning_rate": 1.3285929576365541e-05, + "loss": 0.5729, + "step": 5306 + }, + { + "epoch": 0.41, + "grad_norm": 1.176074374847541, + "learning_rate": 1.3283556326098315e-05, + "loss": 0.5978, + "step": 5307 + }, + { + "epoch": 0.41, + "grad_norm": 1.0618518647653676, + "learning_rate": 1.3281182868523235e-05, + "loss": 0.5393, + "step": 5308 + }, + { + "epoch": 0.41, + "grad_norm": 1.1610846609746257, + "learning_rate": 1.327880920379015e-05, + "loss": 0.5998, + "step": 5309 + }, + { + "epoch": 0.41, + "grad_norm": 1.11523271817754, + "learning_rate": 1.3276435332048924e-05, + "loss": 0.5493, + "step": 5310 + }, + { + "epoch": 0.41, + "grad_norm": 1.141820672825066, + "learning_rate": 1.3274061253449434e-05, + "loss": 0.5584, + "step": 5311 + }, + { + "epoch": 0.41, + "grad_norm": 1.1834210058875512, + "learning_rate": 1.3271686968141566e-05, + "loss": 0.5643, + "step": 5312 + }, + { + "epoch": 0.41, + "grad_norm": 1.14208984375, + "learning_rate": 1.3269312476275216e-05, + "loss": 0.5913, + "step": 5313 + }, + { + "epoch": 0.41, + "grad_norm": 1.3027190017365033, + "learning_rate": 1.3266937778000304e-05, + "loss": 0.5957, + "step": 5314 + }, + { + "epoch": 0.41, + "grad_norm": 1.251638435412809, + "learning_rate": 1.3264562873466753e-05, + "loss": 0.5749, + "step": 5315 + }, + { + "epoch": 0.41, + "grad_norm": 1.1748023029154155, + "learning_rate": 1.3262187762824507e-05, + "loss": 0.5329, + "step": 5316 + }, + { + "epoch": 0.41, + "grad_norm": 1.083255887319949, + "learning_rate": 1.3259812446223519e-05, + "loss": 0.529, + "step": 5317 + }, + { + "epoch": 0.41, + "grad_norm": 1.2883288983658967, + "learning_rate": 1.325743692381375e-05, + "loss": 0.5766, + "step": 5318 + }, + { + "epoch": 0.41, + "grad_norm": 1.259754080312896, + "learning_rate": 1.3255061195745182e-05, + "loss": 0.5885, + "step": 5319 + }, + { + "epoch": 0.41, + "grad_norm": 1.1492570820113677, + "learning_rate": 1.3252685262167803e-05, + "loss": 0.5734, + "step": 5320 + }, + { + "epoch": 0.41, + "grad_norm": 1.2455881940779543, + "learning_rate": 1.3250309123231629e-05, + "loss": 0.576, + "step": 5321 + }, + { + "epoch": 0.41, + "grad_norm": 1.1059794510006804, + "learning_rate": 1.3247932779086668e-05, + "loss": 0.57, + "step": 5322 + }, + { + "epoch": 0.41, + "grad_norm": 1.2401644949863038, + "learning_rate": 1.324555622988295e-05, + "loss": 0.5872, + "step": 5323 + }, + { + "epoch": 0.41, + "grad_norm": 1.210066549819751, + "learning_rate": 1.3243179475770527e-05, + "loss": 0.6203, + "step": 5324 + }, + { + "epoch": 0.41, + "grad_norm": 1.2145093844079853, + "learning_rate": 1.324080251689945e-05, + "loss": 0.5659, + "step": 5325 + }, + { + "epoch": 0.41, + "grad_norm": 1.1001292434539458, + "learning_rate": 1.3238425353419788e-05, + "loss": 0.5789, + "step": 5326 + }, + { + "epoch": 0.41, + "grad_norm": 1.1445031276541822, + "learning_rate": 1.3236047985481628e-05, + "loss": 0.5317, + "step": 5327 + }, + { + "epoch": 0.41, + "grad_norm": 1.0828071869338616, + "learning_rate": 1.3233670413235059e-05, + "loss": 0.5317, + "step": 5328 + }, + { + "epoch": 0.41, + "grad_norm": 1.2630660478292075, + "learning_rate": 1.3231292636830198e-05, + "loss": 0.5649, + "step": 5329 + }, + { + "epoch": 0.41, + "grad_norm": 1.1830154869633025, + "learning_rate": 1.3228914656417156e-05, + "loss": 0.5365, + "step": 5330 + }, + { + "epoch": 0.41, + "grad_norm": 1.2508666372133053, + "learning_rate": 1.3226536472146076e-05, + "loss": 0.5445, + "step": 5331 + }, + { + "epoch": 0.41, + "grad_norm": 1.0856833400654309, + "learning_rate": 1.3224158084167104e-05, + "loss": 0.5069, + "step": 5332 + }, + { + "epoch": 0.41, + "grad_norm": 1.23172819078547, + "learning_rate": 1.3221779492630393e-05, + "loss": 0.632, + "step": 5333 + }, + { + "epoch": 0.41, + "grad_norm": 1.2798880339617424, + "learning_rate": 1.3219400697686125e-05, + "loss": 0.5986, + "step": 5334 + }, + { + "epoch": 0.41, + "grad_norm": 1.200395584468608, + "learning_rate": 1.3217021699484476e-05, + "loss": 0.5474, + "step": 5335 + }, + { + "epoch": 0.41, + "grad_norm": 1.1187671617438322, + "learning_rate": 1.3214642498175654e-05, + "loss": 0.5266, + "step": 5336 + }, + { + "epoch": 0.41, + "grad_norm": 1.244319693286147, + "learning_rate": 1.3212263093909865e-05, + "loss": 0.5722, + "step": 5337 + }, + { + "epoch": 0.41, + "grad_norm": 1.2204440643076186, + "learning_rate": 1.320988348683733e-05, + "loss": 0.5923, + "step": 5338 + }, + { + "epoch": 0.41, + "grad_norm": 1.070028893980839, + "learning_rate": 1.3207503677108294e-05, + "loss": 0.5591, + "step": 5339 + }, + { + "epoch": 0.41, + "grad_norm": 1.2247816144168766, + "learning_rate": 1.3205123664872999e-05, + "loss": 0.6196, + "step": 5340 + }, + { + "epoch": 0.41, + "grad_norm": 1.2793022330368473, + "learning_rate": 1.3202743450281714e-05, + "loss": 0.5531, + "step": 5341 + }, + { + "epoch": 0.41, + "grad_norm": 1.1453216219300684, + "learning_rate": 1.3200363033484709e-05, + "loss": 0.552, + "step": 5342 + }, + { + "epoch": 0.41, + "grad_norm": 1.1078184786744194, + "learning_rate": 1.3197982414632272e-05, + "loss": 0.5344, + "step": 5343 + }, + { + "epoch": 0.41, + "grad_norm": 1.2137208023252337, + "learning_rate": 1.319560159387471e-05, + "loss": 0.6294, + "step": 5344 + }, + { + "epoch": 0.41, + "grad_norm": 1.0982733787029875, + "learning_rate": 1.3193220571362328e-05, + "loss": 0.561, + "step": 5345 + }, + { + "epoch": 0.41, + "grad_norm": 1.1329781772872085, + "learning_rate": 1.319083934724546e-05, + "loss": 0.5656, + "step": 5346 + }, + { + "epoch": 0.41, + "grad_norm": 1.2756605344661, + "learning_rate": 1.3188457921674443e-05, + "loss": 0.6146, + "step": 5347 + }, + { + "epoch": 0.41, + "grad_norm": 1.3251142596516732, + "learning_rate": 1.3186076294799624e-05, + "loss": 0.6147, + "step": 5348 + }, + { + "epoch": 0.41, + "grad_norm": 1.1498387928630491, + "learning_rate": 1.318369446677137e-05, + "loss": 0.5602, + "step": 5349 + }, + { + "epoch": 0.42, + "grad_norm": 1.2273020853198169, + "learning_rate": 1.3181312437740059e-05, + "loss": 0.5581, + "step": 5350 + }, + { + "epoch": 0.42, + "grad_norm": 1.2477606742217344, + "learning_rate": 1.3178930207856082e-05, + "loss": 0.6612, + "step": 5351 + }, + { + "epoch": 0.42, + "grad_norm": 1.1590274752634329, + "learning_rate": 1.3176547777269838e-05, + "loss": 0.5668, + "step": 5352 + }, + { + "epoch": 0.42, + "grad_norm": 1.1709371756030678, + "learning_rate": 1.3174165146131746e-05, + "loss": 0.5459, + "step": 5353 + }, + { + "epoch": 0.42, + "grad_norm": 1.1294575965118923, + "learning_rate": 1.317178231459223e-05, + "loss": 0.5451, + "step": 5354 + }, + { + "epoch": 0.42, + "grad_norm": 1.1875674579935054, + "learning_rate": 1.3169399282801731e-05, + "loss": 0.5504, + "step": 5355 + }, + { + "epoch": 0.42, + "grad_norm": 1.1710273219959833, + "learning_rate": 1.3167016050910709e-05, + "loss": 0.5638, + "step": 5356 + }, + { + "epoch": 0.42, + "grad_norm": 1.1267822770801894, + "learning_rate": 1.316463261906962e-05, + "loss": 0.5479, + "step": 5357 + }, + { + "epoch": 0.42, + "grad_norm": 1.0659092454120829, + "learning_rate": 1.3162248987428945e-05, + "loss": 0.5677, + "step": 5358 + }, + { + "epoch": 0.42, + "grad_norm": 1.077452684943488, + "learning_rate": 1.3159865156139177e-05, + "loss": 0.5328, + "step": 5359 + }, + { + "epoch": 0.42, + "grad_norm": 1.1878566708238993, + "learning_rate": 1.3157481125350819e-05, + "loss": 0.5938, + "step": 5360 + }, + { + "epoch": 0.42, + "grad_norm": 1.136143699516451, + "learning_rate": 1.315509689521439e-05, + "loss": 0.5728, + "step": 5361 + }, + { + "epoch": 0.42, + "grad_norm": 1.2602971811117896, + "learning_rate": 1.3152712465880415e-05, + "loss": 0.5448, + "step": 5362 + }, + { + "epoch": 0.42, + "grad_norm": 1.1265945260711152, + "learning_rate": 1.3150327837499433e-05, + "loss": 0.5356, + "step": 5363 + }, + { + "epoch": 0.42, + "grad_norm": 1.1729767261710076, + "learning_rate": 1.3147943010222001e-05, + "loss": 0.6101, + "step": 5364 + }, + { + "epoch": 0.42, + "grad_norm": 1.1346307828696363, + "learning_rate": 1.3145557984198689e-05, + "loss": 0.5428, + "step": 5365 + }, + { + "epoch": 0.42, + "grad_norm": 1.3409336650636199, + "learning_rate": 1.3143172759580072e-05, + "loss": 0.6448, + "step": 5366 + }, + { + "epoch": 0.42, + "grad_norm": 1.1121967052262058, + "learning_rate": 1.3140787336516743e-05, + "loss": 0.5277, + "step": 5367 + }, + { + "epoch": 0.42, + "grad_norm": 1.2291394516254261, + "learning_rate": 1.3138401715159301e-05, + "loss": 0.5759, + "step": 5368 + }, + { + "epoch": 0.42, + "grad_norm": 1.1726890788098938, + "learning_rate": 1.313601589565837e-05, + "loss": 0.6022, + "step": 5369 + }, + { + "epoch": 0.42, + "grad_norm": 1.1335142921452013, + "learning_rate": 1.3133629878164573e-05, + "loss": 0.5853, + "step": 5370 + }, + { + "epoch": 0.42, + "grad_norm": 1.1698067278867612, + "learning_rate": 1.3131243662828558e-05, + "loss": 0.5806, + "step": 5371 + }, + { + "epoch": 0.42, + "grad_norm": 1.2313468585184584, + "learning_rate": 1.3128857249800975e-05, + "loss": 0.5683, + "step": 5372 + }, + { + "epoch": 0.42, + "grad_norm": 1.251265409833926, + "learning_rate": 1.3126470639232487e-05, + "loss": 0.5452, + "step": 5373 + }, + { + "epoch": 0.42, + "grad_norm": 1.3091814685634704, + "learning_rate": 1.312408383127378e-05, + "loss": 0.6112, + "step": 5374 + }, + { + "epoch": 0.42, + "grad_norm": 1.237139924963708, + "learning_rate": 1.3121696826075542e-05, + "loss": 0.5878, + "step": 5375 + }, + { + "epoch": 0.42, + "grad_norm": 1.1172732340321048, + "learning_rate": 1.3119309623788479e-05, + "loss": 0.5318, + "step": 5376 + }, + { + "epoch": 0.42, + "grad_norm": 1.0744853295824257, + "learning_rate": 1.3116922224563306e-05, + "loss": 0.5582, + "step": 5377 + }, + { + "epoch": 0.42, + "grad_norm": 1.2414673451105471, + "learning_rate": 1.311453462855075e-05, + "loss": 0.5564, + "step": 5378 + }, + { + "epoch": 0.42, + "grad_norm": 1.154450537043692, + "learning_rate": 1.3112146835901552e-05, + "loss": 0.6046, + "step": 5379 + }, + { + "epoch": 0.42, + "grad_norm": 1.0977557805777245, + "learning_rate": 1.3109758846766472e-05, + "loss": 0.5358, + "step": 5380 + }, + { + "epoch": 0.42, + "grad_norm": 1.1126979437474374, + "learning_rate": 1.310737066129627e-05, + "loss": 0.555, + "step": 5381 + }, + { + "epoch": 0.42, + "grad_norm": 1.1137912050753107, + "learning_rate": 1.3104982279641727e-05, + "loss": 0.5925, + "step": 5382 + }, + { + "epoch": 0.42, + "grad_norm": 1.1289658580209436, + "learning_rate": 1.3102593701953632e-05, + "loss": 0.5432, + "step": 5383 + }, + { + "epoch": 0.42, + "grad_norm": 1.2640440686064511, + "learning_rate": 1.3100204928382787e-05, + "loss": 0.6193, + "step": 5384 + }, + { + "epoch": 0.42, + "grad_norm": 1.1640846711165926, + "learning_rate": 1.3097815959080013e-05, + "loss": 0.5981, + "step": 5385 + }, + { + "epoch": 0.42, + "grad_norm": 1.2061643481110838, + "learning_rate": 1.3095426794196136e-05, + "loss": 0.5771, + "step": 5386 + }, + { + "epoch": 0.42, + "grad_norm": 1.1018096430150346, + "learning_rate": 1.3093037433881995e-05, + "loss": 0.5421, + "step": 5387 + }, + { + "epoch": 0.42, + "grad_norm": 1.1087404504857186, + "learning_rate": 1.3090647878288441e-05, + "loss": 0.5428, + "step": 5388 + }, + { + "epoch": 0.42, + "grad_norm": 1.328570481928475, + "learning_rate": 1.3088258127566338e-05, + "loss": 0.6409, + "step": 5389 + }, + { + "epoch": 0.42, + "grad_norm": 1.2403104503457993, + "learning_rate": 1.3085868181866571e-05, + "loss": 0.598, + "step": 5390 + }, + { + "epoch": 0.42, + "grad_norm": 1.2386832081238086, + "learning_rate": 1.3083478041340023e-05, + "loss": 0.5799, + "step": 5391 + }, + { + "epoch": 0.42, + "grad_norm": 1.1597264068335127, + "learning_rate": 1.3081087706137596e-05, + "loss": 0.608, + "step": 5392 + }, + { + "epoch": 0.42, + "grad_norm": 1.1772125758332481, + "learning_rate": 1.307869717641021e-05, + "loss": 0.559, + "step": 5393 + }, + { + "epoch": 0.42, + "grad_norm": 1.0811032057766397, + "learning_rate": 1.3076306452308782e-05, + "loss": 0.5869, + "step": 5394 + }, + { + "epoch": 0.42, + "grad_norm": 1.205162403986662, + "learning_rate": 1.3073915533984262e-05, + "loss": 0.5905, + "step": 5395 + }, + { + "epoch": 0.42, + "grad_norm": 1.2341183262763087, + "learning_rate": 1.307152442158759e-05, + "loss": 0.613, + "step": 5396 + }, + { + "epoch": 0.42, + "grad_norm": 1.1664420263548732, + "learning_rate": 1.3069133115269734e-05, + "loss": 0.5878, + "step": 5397 + }, + { + "epoch": 0.42, + "grad_norm": 1.0628185355719757, + "learning_rate": 1.3066741615181675e-05, + "loss": 0.5476, + "step": 5398 + }, + { + "epoch": 0.42, + "grad_norm": 1.218004585605126, + "learning_rate": 1.306434992147439e-05, + "loss": 0.6091, + "step": 5399 + }, + { + "epoch": 0.42, + "grad_norm": 1.090991984274113, + "learning_rate": 1.306195803429889e-05, + "loss": 0.4964, + "step": 5400 + }, + { + "epoch": 0.42, + "grad_norm": 1.113897588008314, + "learning_rate": 1.3059565953806177e-05, + "loss": 0.5366, + "step": 5401 + }, + { + "epoch": 0.42, + "grad_norm": 1.2068524274488157, + "learning_rate": 1.305717368014728e-05, + "loss": 0.6072, + "step": 5402 + }, + { + "epoch": 0.42, + "grad_norm": 1.210839987489532, + "learning_rate": 1.3054781213473238e-05, + "loss": 0.5776, + "step": 5403 + }, + { + "epoch": 0.42, + "grad_norm": 1.1703435427695652, + "learning_rate": 1.3052388553935096e-05, + "loss": 0.5825, + "step": 5404 + }, + { + "epoch": 0.42, + "grad_norm": 1.168563844267776, + "learning_rate": 1.304999570168392e-05, + "loss": 0.5624, + "step": 5405 + }, + { + "epoch": 0.42, + "grad_norm": 1.2287186544306676, + "learning_rate": 1.3047602656870775e-05, + "loss": 0.5939, + "step": 5406 + }, + { + "epoch": 0.42, + "grad_norm": 1.2171640592186117, + "learning_rate": 1.3045209419646749e-05, + "loss": 0.594, + "step": 5407 + }, + { + "epoch": 0.42, + "grad_norm": 1.1286022476537598, + "learning_rate": 1.3042815990162944e-05, + "loss": 0.573, + "step": 5408 + }, + { + "epoch": 0.42, + "grad_norm": 1.1666646968734098, + "learning_rate": 1.3040422368570466e-05, + "loss": 0.5778, + "step": 5409 + }, + { + "epoch": 0.42, + "grad_norm": 1.1808447813898302, + "learning_rate": 1.3038028555020444e-05, + "loss": 0.5682, + "step": 5410 + }, + { + "epoch": 0.42, + "grad_norm": 1.1902489716674234, + "learning_rate": 1.3035634549663995e-05, + "loss": 0.5291, + "step": 5411 + }, + { + "epoch": 0.42, + "grad_norm": 1.1010248481863851, + "learning_rate": 1.3033240352652281e-05, + "loss": 0.5467, + "step": 5412 + }, + { + "epoch": 0.42, + "grad_norm": 1.274650557633351, + "learning_rate": 1.3030845964136452e-05, + "loss": 0.6078, + "step": 5413 + }, + { + "epoch": 0.42, + "grad_norm": 1.1680105960966254, + "learning_rate": 1.3028451384267679e-05, + "loss": 0.5619, + "step": 5414 + }, + { + "epoch": 0.42, + "grad_norm": 1.1307461011483286, + "learning_rate": 1.3026056613197151e-05, + "loss": 0.5746, + "step": 5415 + }, + { + "epoch": 0.42, + "grad_norm": 1.1679891118855061, + "learning_rate": 1.3023661651076051e-05, + "loss": 0.5522, + "step": 5416 + }, + { + "epoch": 0.42, + "grad_norm": 1.2030811549532923, + "learning_rate": 1.3021266498055592e-05, + "loss": 0.5574, + "step": 5417 + }, + { + "epoch": 0.42, + "grad_norm": 1.2734235517053876, + "learning_rate": 1.3018871154286991e-05, + "loss": 0.6376, + "step": 5418 + }, + { + "epoch": 0.42, + "grad_norm": 1.1515076270933733, + "learning_rate": 1.3016475619921477e-05, + "loss": 0.5837, + "step": 5419 + }, + { + "epoch": 0.42, + "grad_norm": 1.2972100985439499, + "learning_rate": 1.3014079895110299e-05, + "loss": 0.6417, + "step": 5420 + }, + { + "epoch": 0.42, + "grad_norm": 1.1155738647880546, + "learning_rate": 1.3011683980004705e-05, + "loss": 0.5376, + "step": 5421 + }, + { + "epoch": 0.42, + "grad_norm": 1.1662838625719312, + "learning_rate": 1.3009287874755963e-05, + "loss": 0.6136, + "step": 5422 + }, + { + "epoch": 0.42, + "grad_norm": 1.1438499740515875, + "learning_rate": 1.3006891579515351e-05, + "loss": 0.528, + "step": 5423 + }, + { + "epoch": 0.42, + "grad_norm": 1.1405953181337083, + "learning_rate": 1.3004495094434157e-05, + "loss": 0.571, + "step": 5424 + }, + { + "epoch": 0.42, + "grad_norm": 1.1564098453865295, + "learning_rate": 1.3002098419663692e-05, + "loss": 0.5731, + "step": 5425 + }, + { + "epoch": 0.42, + "grad_norm": 1.0746658781804426, + "learning_rate": 1.2999701555355264e-05, + "loss": 0.5333, + "step": 5426 + }, + { + "epoch": 0.42, + "grad_norm": 1.1580312257157257, + "learning_rate": 1.2997304501660197e-05, + "loss": 0.5383, + "step": 5427 + }, + { + "epoch": 0.42, + "grad_norm": 1.1553913227213064, + "learning_rate": 1.2994907258729835e-05, + "loss": 0.5779, + "step": 5428 + }, + { + "epoch": 0.42, + "grad_norm": 1.1828487053070678, + "learning_rate": 1.2992509826715525e-05, + "loss": 0.5551, + "step": 5429 + }, + { + "epoch": 0.42, + "grad_norm": 1.202424030957592, + "learning_rate": 1.2990112205768632e-05, + "loss": 0.6345, + "step": 5430 + }, + { + "epoch": 0.42, + "grad_norm": 1.2214735849829235, + "learning_rate": 1.2987714396040527e-05, + "loss": 0.6044, + "step": 5431 + }, + { + "epoch": 0.42, + "grad_norm": 1.1452872738005164, + "learning_rate": 1.2985316397682597e-05, + "loss": 0.5728, + "step": 5432 + }, + { + "epoch": 0.42, + "grad_norm": 1.097056486325496, + "learning_rate": 1.2982918210846243e-05, + "loss": 0.542, + "step": 5433 + }, + { + "epoch": 0.42, + "grad_norm": 1.077531125798468, + "learning_rate": 1.298051983568287e-05, + "loss": 0.5364, + "step": 5434 + }, + { + "epoch": 0.42, + "grad_norm": 1.2556836611923696, + "learning_rate": 1.2978121272343904e-05, + "loss": 0.5945, + "step": 5435 + }, + { + "epoch": 0.42, + "grad_norm": 1.165231003022694, + "learning_rate": 1.2975722520980777e-05, + "loss": 0.5832, + "step": 5436 + }, + { + "epoch": 0.42, + "grad_norm": 1.2382334510410542, + "learning_rate": 1.2973323581744935e-05, + "loss": 0.5825, + "step": 5437 + }, + { + "epoch": 0.42, + "grad_norm": 1.2646124286017335, + "learning_rate": 1.2970924454787834e-05, + "loss": 0.5928, + "step": 5438 + }, + { + "epoch": 0.42, + "grad_norm": 1.168861634741927, + "learning_rate": 1.2968525140260946e-05, + "loss": 0.6037, + "step": 5439 + }, + { + "epoch": 0.42, + "grad_norm": 1.072208034726781, + "learning_rate": 1.2966125638315745e-05, + "loss": 0.5404, + "step": 5440 + }, + { + "epoch": 0.42, + "grad_norm": 1.0987807517581774, + "learning_rate": 1.2963725949103734e-05, + "loss": 0.5726, + "step": 5441 + }, + { + "epoch": 0.42, + "grad_norm": 1.1791842062806022, + "learning_rate": 1.2961326072776412e-05, + "loss": 0.5786, + "step": 5442 + }, + { + "epoch": 0.42, + "grad_norm": 1.2925609394958855, + "learning_rate": 1.2958926009485297e-05, + "loss": 0.5977, + "step": 5443 + }, + { + "epoch": 0.42, + "grad_norm": 1.2435406686032486, + "learning_rate": 1.2956525759381917e-05, + "loss": 0.6286, + "step": 5444 + }, + { + "epoch": 0.42, + "grad_norm": 1.1867145148999685, + "learning_rate": 1.295412532261781e-05, + "loss": 0.542, + "step": 5445 + }, + { + "epoch": 0.42, + "grad_norm": 1.0957595348287095, + "learning_rate": 1.2951724699344532e-05, + "loss": 0.5126, + "step": 5446 + }, + { + "epoch": 0.42, + "grad_norm": 1.2980519664571566, + "learning_rate": 1.2949323889713643e-05, + "loss": 0.6512, + "step": 5447 + }, + { + "epoch": 0.42, + "grad_norm": 1.2518532366486848, + "learning_rate": 1.2946922893876722e-05, + "loss": 0.5531, + "step": 5448 + }, + { + "epoch": 0.42, + "grad_norm": 1.0681618572413925, + "learning_rate": 1.2944521711985357e-05, + "loss": 0.5069, + "step": 5449 + }, + { + "epoch": 0.42, + "grad_norm": 1.2524750048794988, + "learning_rate": 1.2942120344191141e-05, + "loss": 0.5952, + "step": 5450 + }, + { + "epoch": 0.42, + "grad_norm": 1.1794976087920306, + "learning_rate": 1.293971879064569e-05, + "loss": 0.5681, + "step": 5451 + }, + { + "epoch": 0.42, + "grad_norm": 1.0510496909846871, + "learning_rate": 1.2937317051500622e-05, + "loss": 0.5169, + "step": 5452 + }, + { + "epoch": 0.42, + "grad_norm": 1.2114186469301964, + "learning_rate": 1.2934915126907575e-05, + "loss": 0.5952, + "step": 5453 + }, + { + "epoch": 0.42, + "grad_norm": 1.1396633431604761, + "learning_rate": 1.2932513017018197e-05, + "loss": 0.5781, + "step": 5454 + }, + { + "epoch": 0.42, + "grad_norm": 1.1030882421318362, + "learning_rate": 1.293011072198414e-05, + "loss": 0.5656, + "step": 5455 + }, + { + "epoch": 0.42, + "grad_norm": 1.1908544544583044, + "learning_rate": 1.2927708241957077e-05, + "loss": 0.5282, + "step": 5456 + }, + { + "epoch": 0.42, + "grad_norm": 1.1783158873669861, + "learning_rate": 1.2925305577088687e-05, + "loss": 0.5524, + "step": 5457 + }, + { + "epoch": 0.42, + "grad_norm": 1.1286463982909718, + "learning_rate": 1.2922902727530663e-05, + "loss": 0.5551, + "step": 5458 + }, + { + "epoch": 0.42, + "grad_norm": 1.1902179733162859, + "learning_rate": 1.2920499693434712e-05, + "loss": 0.6409, + "step": 5459 + }, + { + "epoch": 0.42, + "grad_norm": 1.2854356548982162, + "learning_rate": 1.2918096474952544e-05, + "loss": 0.6034, + "step": 5460 + }, + { + "epoch": 0.42, + "grad_norm": 1.156583892451261, + "learning_rate": 1.2915693072235893e-05, + "loss": 0.5793, + "step": 5461 + }, + { + "epoch": 0.42, + "grad_norm": 1.2416669126591449, + "learning_rate": 1.2913289485436492e-05, + "loss": 0.5335, + "step": 5462 + }, + { + "epoch": 0.42, + "grad_norm": 1.1119004637883199, + "learning_rate": 1.2910885714706093e-05, + "loss": 0.5481, + "step": 5463 + }, + { + "epoch": 0.42, + "grad_norm": 1.2679661419948003, + "learning_rate": 1.290848176019647e-05, + "loss": 0.6347, + "step": 5464 + }, + { + "epoch": 0.42, + "grad_norm": 1.2130405990586628, + "learning_rate": 1.290607762205938e-05, + "loss": 0.6087, + "step": 5465 + }, + { + "epoch": 0.42, + "grad_norm": 1.1688536286948465, + "learning_rate": 1.2903673300446623e-05, + "loss": 0.5916, + "step": 5466 + }, + { + "epoch": 0.42, + "grad_norm": 1.303191646637205, + "learning_rate": 1.2901268795509982e-05, + "loss": 0.6474, + "step": 5467 + }, + { + "epoch": 0.42, + "grad_norm": 1.100066284870186, + "learning_rate": 1.2898864107401275e-05, + "loss": 0.5632, + "step": 5468 + }, + { + "epoch": 0.42, + "grad_norm": 1.2421815500176878, + "learning_rate": 1.2896459236272325e-05, + "loss": 0.5645, + "step": 5469 + }, + { + "epoch": 0.42, + "grad_norm": 1.1206938133238493, + "learning_rate": 1.2894054182274956e-05, + "loss": 0.5103, + "step": 5470 + }, + { + "epoch": 0.42, + "grad_norm": 1.1077585398199128, + "learning_rate": 1.2891648945561017e-05, + "loss": 0.5008, + "step": 5471 + }, + { + "epoch": 0.42, + "grad_norm": 1.185392969941593, + "learning_rate": 1.2889243526282357e-05, + "loss": 0.564, + "step": 5472 + }, + { + "epoch": 0.42, + "grad_norm": 1.1988177296873601, + "learning_rate": 1.288683792459085e-05, + "loss": 0.5892, + "step": 5473 + }, + { + "epoch": 0.42, + "grad_norm": 1.2340225972157672, + "learning_rate": 1.2884432140638375e-05, + "loss": 0.6147, + "step": 5474 + }, + { + "epoch": 0.42, + "grad_norm": 1.2384168869486223, + "learning_rate": 1.2882026174576812e-05, + "loss": 0.5768, + "step": 5475 + }, + { + "epoch": 0.42, + "grad_norm": 1.1702327665458885, + "learning_rate": 1.2879620026558067e-05, + "loss": 0.5321, + "step": 5476 + }, + { + "epoch": 0.42, + "grad_norm": 1.0680964005035958, + "learning_rate": 1.2877213696734052e-05, + "loss": 0.5685, + "step": 5477 + }, + { + "epoch": 0.42, + "grad_norm": 1.133198376408608, + "learning_rate": 1.2874807185256692e-05, + "loss": 0.5319, + "step": 5478 + }, + { + "epoch": 0.43, + "grad_norm": 1.2608793791241737, + "learning_rate": 1.2872400492277928e-05, + "loss": 0.5705, + "step": 5479 + }, + { + "epoch": 0.43, + "grad_norm": 1.3037042624770598, + "learning_rate": 1.2869993617949696e-05, + "loss": 0.6176, + "step": 5480 + }, + { + "epoch": 0.43, + "grad_norm": 1.2906044725166501, + "learning_rate": 1.2867586562423962e-05, + "loss": 0.5741, + "step": 5481 + }, + { + "epoch": 0.43, + "grad_norm": 1.1569897115541745, + "learning_rate": 1.2865179325852693e-05, + "loss": 0.5376, + "step": 5482 + }, + { + "epoch": 0.43, + "grad_norm": 1.2820102018108113, + "learning_rate": 1.2862771908387867e-05, + "loss": 0.6216, + "step": 5483 + }, + { + "epoch": 0.43, + "grad_norm": 1.2591156931585912, + "learning_rate": 1.2860364310181488e-05, + "loss": 0.5675, + "step": 5484 + }, + { + "epoch": 0.43, + "grad_norm": 1.1636384537255418, + "learning_rate": 1.2857956531385548e-05, + "loss": 0.579, + "step": 5485 + }, + { + "epoch": 0.43, + "grad_norm": 1.1578285679493767, + "learning_rate": 1.2855548572152066e-05, + "loss": 0.5645, + "step": 5486 + }, + { + "epoch": 0.43, + "grad_norm": 1.1476531172177928, + "learning_rate": 1.2853140432633074e-05, + "loss": 0.503, + "step": 5487 + }, + { + "epoch": 0.43, + "grad_norm": 1.2773189717260567, + "learning_rate": 1.2850732112980602e-05, + "loss": 0.5681, + "step": 5488 + }, + { + "epoch": 0.43, + "grad_norm": 1.2145730357881368, + "learning_rate": 1.2848323613346708e-05, + "loss": 0.5966, + "step": 5489 + }, + { + "epoch": 0.43, + "grad_norm": 1.1647737109736245, + "learning_rate": 1.2845914933883443e-05, + "loss": 0.6117, + "step": 5490 + }, + { + "epoch": 0.43, + "grad_norm": 1.1003217811702242, + "learning_rate": 1.2843506074742888e-05, + "loss": 0.5412, + "step": 5491 + }, + { + "epoch": 0.43, + "grad_norm": 1.202108870598096, + "learning_rate": 1.2841097036077125e-05, + "loss": 0.615, + "step": 5492 + }, + { + "epoch": 0.43, + "grad_norm": 1.1819249339173739, + "learning_rate": 1.283868781803825e-05, + "loss": 0.5932, + "step": 5493 + }, + { + "epoch": 0.43, + "grad_norm": 1.0883979641669264, + "learning_rate": 1.2836278420778366e-05, + "loss": 0.5207, + "step": 5494 + }, + { + "epoch": 0.43, + "grad_norm": 1.1347918351473658, + "learning_rate": 1.2833868844449588e-05, + "loss": 0.5725, + "step": 5495 + }, + { + "epoch": 0.43, + "grad_norm": 1.232446201767509, + "learning_rate": 1.283145908920405e-05, + "loss": 0.592, + "step": 5496 + }, + { + "epoch": 0.43, + "grad_norm": 1.3793116937930812, + "learning_rate": 1.2829049155193896e-05, + "loss": 0.6154, + "step": 5497 + }, + { + "epoch": 0.43, + "grad_norm": 1.246839533327724, + "learning_rate": 1.282663904257127e-05, + "loss": 0.6238, + "step": 5498 + }, + { + "epoch": 0.43, + "grad_norm": 1.0381274497630815, + "learning_rate": 1.2824228751488339e-05, + "loss": 0.505, + "step": 5499 + }, + { + "epoch": 0.43, + "grad_norm": 1.212783194435812, + "learning_rate": 1.2821818282097273e-05, + "loss": 0.5273, + "step": 5500 + }, + { + "epoch": 0.43, + "grad_norm": 1.1216896945427937, + "learning_rate": 1.281940763455026e-05, + "loss": 0.5618, + "step": 5501 + }, + { + "epoch": 0.43, + "grad_norm": 1.3120100378402657, + "learning_rate": 1.28169968089995e-05, + "loss": 0.5818, + "step": 5502 + }, + { + "epoch": 0.43, + "grad_norm": 1.1351507849670044, + "learning_rate": 1.2814585805597197e-05, + "loss": 0.5136, + "step": 5503 + }, + { + "epoch": 0.43, + "grad_norm": 1.2382589633031131, + "learning_rate": 1.2812174624495569e-05, + "loss": 0.5979, + "step": 5504 + }, + { + "epoch": 0.43, + "grad_norm": 1.1588280264088582, + "learning_rate": 1.2809763265846851e-05, + "loss": 0.5037, + "step": 5505 + }, + { + "epoch": 0.43, + "grad_norm": 1.1998168249197858, + "learning_rate": 1.2807351729803277e-05, + "loss": 0.5475, + "step": 5506 + }, + { + "epoch": 0.43, + "grad_norm": 1.2553727555384517, + "learning_rate": 1.280494001651711e-05, + "loss": 0.6203, + "step": 5507 + }, + { + "epoch": 0.43, + "grad_norm": 1.2009800703537514, + "learning_rate": 1.2802528126140604e-05, + "loss": 0.6095, + "step": 5508 + }, + { + "epoch": 0.43, + "grad_norm": 1.2123136514830408, + "learning_rate": 1.2800116058826037e-05, + "loss": 0.6154, + "step": 5509 + }, + { + "epoch": 0.43, + "grad_norm": 1.1539214632326678, + "learning_rate": 1.2797703814725702e-05, + "loss": 0.576, + "step": 5510 + }, + { + "epoch": 0.43, + "grad_norm": 1.1659747297057943, + "learning_rate": 1.2795291393991885e-05, + "loss": 0.6005, + "step": 5511 + }, + { + "epoch": 0.43, + "grad_norm": 1.1356948488428176, + "learning_rate": 1.2792878796776904e-05, + "loss": 0.6047, + "step": 5512 + }, + { + "epoch": 0.43, + "grad_norm": 1.085219605630756, + "learning_rate": 1.2790466023233075e-05, + "loss": 0.5306, + "step": 5513 + }, + { + "epoch": 0.43, + "grad_norm": 1.2194775586036035, + "learning_rate": 1.2788053073512728e-05, + "loss": 0.6436, + "step": 5514 + }, + { + "epoch": 0.43, + "grad_norm": 1.1882131342579478, + "learning_rate": 1.2785639947768207e-05, + "loss": 0.5168, + "step": 5515 + }, + { + "epoch": 0.43, + "grad_norm": 1.133888943369136, + "learning_rate": 1.2783226646151863e-05, + "loss": 0.5642, + "step": 5516 + }, + { + "epoch": 0.43, + "grad_norm": 1.153933860123063, + "learning_rate": 1.2780813168816062e-05, + "loss": 0.5221, + "step": 5517 + }, + { + "epoch": 0.43, + "grad_norm": 1.1556269023543317, + "learning_rate": 1.2778399515913177e-05, + "loss": 0.5666, + "step": 5518 + }, + { + "epoch": 0.43, + "grad_norm": 1.175304886646964, + "learning_rate": 1.2775985687595598e-05, + "loss": 0.5668, + "step": 5519 + }, + { + "epoch": 0.43, + "grad_norm": 1.1536244145709227, + "learning_rate": 1.277357168401572e-05, + "loss": 0.4967, + "step": 5520 + }, + { + "epoch": 0.43, + "grad_norm": 1.1527478560173565, + "learning_rate": 1.2771157505325951e-05, + "loss": 0.5899, + "step": 5521 + }, + { + "epoch": 0.43, + "grad_norm": 1.133982560553497, + "learning_rate": 1.2768743151678711e-05, + "loss": 0.5354, + "step": 5522 + }, + { + "epoch": 0.43, + "grad_norm": 1.19549359713714, + "learning_rate": 1.2766328623226434e-05, + "loss": 0.59, + "step": 5523 + }, + { + "epoch": 0.43, + "grad_norm": 1.160543820156621, + "learning_rate": 1.2763913920121554e-05, + "loss": 0.5791, + "step": 5524 + }, + { + "epoch": 0.43, + "grad_norm": 1.2247717352747132, + "learning_rate": 1.2761499042516531e-05, + "loss": 0.5581, + "step": 5525 + }, + { + "epoch": 0.43, + "grad_norm": 1.0866881595916071, + "learning_rate": 1.2759083990563825e-05, + "loss": 0.4941, + "step": 5526 + }, + { + "epoch": 0.43, + "grad_norm": 1.1619143538764027, + "learning_rate": 1.2756668764415913e-05, + "loss": 0.5733, + "step": 5527 + }, + { + "epoch": 0.43, + "grad_norm": 1.0770966974802965, + "learning_rate": 1.275425336422528e-05, + "loss": 0.5467, + "step": 5528 + }, + { + "epoch": 0.43, + "grad_norm": 1.1545138340745242, + "learning_rate": 1.2751837790144419e-05, + "loss": 0.6053, + "step": 5529 + }, + { + "epoch": 0.43, + "grad_norm": 1.1765427768872843, + "learning_rate": 1.2749422042325846e-05, + "loss": 0.5294, + "step": 5530 + }, + { + "epoch": 0.43, + "grad_norm": 1.301201921320522, + "learning_rate": 1.2747006120922068e-05, + "loss": 0.6116, + "step": 5531 + }, + { + "epoch": 0.43, + "grad_norm": 1.1333565819450089, + "learning_rate": 1.2744590026085622e-05, + "loss": 0.5351, + "step": 5532 + }, + { + "epoch": 0.43, + "grad_norm": 1.2013959970184338, + "learning_rate": 1.2742173757969052e-05, + "loss": 0.5668, + "step": 5533 + }, + { + "epoch": 0.43, + "grad_norm": 1.1777583130891491, + "learning_rate": 1.2739757316724901e-05, + "loss": 0.521, + "step": 5534 + }, + { + "epoch": 0.43, + "grad_norm": 1.215725790125941, + "learning_rate": 1.2737340702505737e-05, + "loss": 0.5632, + "step": 5535 + }, + { + "epoch": 0.43, + "grad_norm": 1.22141585638205, + "learning_rate": 1.273492391546413e-05, + "loss": 0.5806, + "step": 5536 + }, + { + "epoch": 0.43, + "grad_norm": 1.1856712012169837, + "learning_rate": 1.2732506955752665e-05, + "loss": 0.5805, + "step": 5537 + }, + { + "epoch": 0.43, + "grad_norm": 1.1348465646054706, + "learning_rate": 1.2730089823523943e-05, + "loss": 0.564, + "step": 5538 + }, + { + "epoch": 0.43, + "grad_norm": 1.3781617832248785, + "learning_rate": 1.2727672518930561e-05, + "loss": 0.6869, + "step": 5539 + }, + { + "epoch": 0.43, + "grad_norm": 1.1141266942087853, + "learning_rate": 1.2725255042125142e-05, + "loss": 0.5721, + "step": 5540 + }, + { + "epoch": 0.43, + "grad_norm": 1.1295708412728245, + "learning_rate": 1.2722837393260308e-05, + "loss": 0.5473, + "step": 5541 + }, + { + "epoch": 0.43, + "grad_norm": 1.19208789774343, + "learning_rate": 1.2720419572488705e-05, + "loss": 0.5541, + "step": 5542 + }, + { + "epoch": 0.43, + "grad_norm": 1.2233611198737826, + "learning_rate": 1.2718001579962978e-05, + "loss": 0.5988, + "step": 5543 + }, + { + "epoch": 0.43, + "grad_norm": 1.2402776275061307, + "learning_rate": 1.2715583415835788e-05, + "loss": 0.6028, + "step": 5544 + }, + { + "epoch": 0.43, + "grad_norm": 1.0926991455351278, + "learning_rate": 1.2713165080259805e-05, + "loss": 0.5535, + "step": 5545 + }, + { + "epoch": 0.43, + "grad_norm": 1.1117348622208707, + "learning_rate": 1.2710746573387716e-05, + "loss": 0.5454, + "step": 5546 + }, + { + "epoch": 0.43, + "grad_norm": 1.1983621128448163, + "learning_rate": 1.2708327895372208e-05, + "loss": 0.5822, + "step": 5547 + }, + { + "epoch": 0.43, + "grad_norm": 1.0697513975137425, + "learning_rate": 1.2705909046365987e-05, + "loss": 0.5298, + "step": 5548 + }, + { + "epoch": 0.43, + "grad_norm": 1.1547100534675536, + "learning_rate": 1.2703490026521766e-05, + "loss": 0.521, + "step": 5549 + }, + { + "epoch": 0.43, + "grad_norm": 1.1326668481255966, + "learning_rate": 1.2701070835992273e-05, + "loss": 0.5192, + "step": 5550 + }, + { + "epoch": 0.43, + "grad_norm": 1.0812013383620682, + "learning_rate": 1.2698651474930239e-05, + "loss": 0.567, + "step": 5551 + }, + { + "epoch": 0.43, + "grad_norm": 1.1635503986592208, + "learning_rate": 1.269623194348842e-05, + "loss": 0.586, + "step": 5552 + }, + { + "epoch": 0.43, + "grad_norm": 1.1103739471997414, + "learning_rate": 1.2693812241819565e-05, + "loss": 0.508, + "step": 5553 + }, + { + "epoch": 0.43, + "grad_norm": 1.3577002370274034, + "learning_rate": 1.2691392370076443e-05, + "loss": 0.6356, + "step": 5554 + }, + { + "epoch": 0.43, + "grad_norm": 1.099004982535233, + "learning_rate": 1.2688972328411836e-05, + "loss": 0.5487, + "step": 5555 + }, + { + "epoch": 0.43, + "grad_norm": 1.31773268886971, + "learning_rate": 1.2686552116978535e-05, + "loss": 0.6177, + "step": 5556 + }, + { + "epoch": 0.43, + "grad_norm": 1.1867376691403835, + "learning_rate": 1.2684131735929337e-05, + "loss": 0.5549, + "step": 5557 + }, + { + "epoch": 0.43, + "grad_norm": 1.1766972316486015, + "learning_rate": 1.2681711185417053e-05, + "loss": 0.5532, + "step": 5558 + }, + { + "epoch": 0.43, + "grad_norm": 1.1879420210693632, + "learning_rate": 1.2679290465594507e-05, + "loss": 0.5639, + "step": 5559 + }, + { + "epoch": 0.43, + "grad_norm": 1.1284058197569167, + "learning_rate": 1.2676869576614527e-05, + "loss": 0.5732, + "step": 5560 + }, + { + "epoch": 0.43, + "grad_norm": 1.1950623992366058, + "learning_rate": 1.2674448518629964e-05, + "loss": 0.5867, + "step": 5561 + }, + { + "epoch": 0.43, + "grad_norm": 1.2503843193526816, + "learning_rate": 1.2672027291793669e-05, + "loss": 0.6329, + "step": 5562 + }, + { + "epoch": 0.43, + "grad_norm": 1.1781078651052352, + "learning_rate": 1.2669605896258503e-05, + "loss": 0.5366, + "step": 5563 + }, + { + "epoch": 0.43, + "grad_norm": 1.1897859655764778, + "learning_rate": 1.2667184332177342e-05, + "loss": 0.5648, + "step": 5564 + }, + { + "epoch": 0.43, + "grad_norm": 1.254836690923467, + "learning_rate": 1.2664762599703073e-05, + "loss": 0.6093, + "step": 5565 + }, + { + "epoch": 0.43, + "grad_norm": 1.2439129439922414, + "learning_rate": 1.2662340698988595e-05, + "loss": 0.5668, + "step": 5566 + }, + { + "epoch": 0.43, + "grad_norm": 1.142804037077906, + "learning_rate": 1.2659918630186814e-05, + "loss": 0.5681, + "step": 5567 + }, + { + "epoch": 0.43, + "grad_norm": 1.1572250173051393, + "learning_rate": 1.2657496393450646e-05, + "loss": 0.5647, + "step": 5568 + }, + { + "epoch": 0.43, + "grad_norm": 1.1247183129196163, + "learning_rate": 1.2655073988933016e-05, + "loss": 0.5784, + "step": 5569 + }, + { + "epoch": 0.43, + "grad_norm": 1.1487916672780292, + "learning_rate": 1.2652651416786867e-05, + "loss": 0.5301, + "step": 5570 + }, + { + "epoch": 0.43, + "grad_norm": 1.2677035277042292, + "learning_rate": 1.2650228677165153e-05, + "loss": 0.593, + "step": 5571 + }, + { + "epoch": 0.43, + "grad_norm": 1.145158302991299, + "learning_rate": 1.2647805770220826e-05, + "loss": 0.5562, + "step": 5572 + }, + { + "epoch": 0.43, + "grad_norm": 1.1512460944226481, + "learning_rate": 1.264538269610686e-05, + "loss": 0.5712, + "step": 5573 + }, + { + "epoch": 0.43, + "grad_norm": 1.1257718934961527, + "learning_rate": 1.2642959454976236e-05, + "loss": 0.5677, + "step": 5574 + }, + { + "epoch": 0.43, + "grad_norm": 1.2000249581920426, + "learning_rate": 1.2640536046981943e-05, + "loss": 0.5582, + "step": 5575 + }, + { + "epoch": 0.43, + "grad_norm": 1.2453488123253045, + "learning_rate": 1.263811247227699e-05, + "loss": 0.5999, + "step": 5576 + }, + { + "epoch": 0.43, + "grad_norm": 1.2416045542463858, + "learning_rate": 1.2635688731014386e-05, + "loss": 0.5865, + "step": 5577 + }, + { + "epoch": 0.43, + "grad_norm": 1.110429880136358, + "learning_rate": 1.2633264823347156e-05, + "loss": 0.5823, + "step": 5578 + }, + { + "epoch": 0.43, + "grad_norm": 1.2268713786945737, + "learning_rate": 1.2630840749428327e-05, + "loss": 0.5936, + "step": 5579 + }, + { + "epoch": 0.43, + "grad_norm": 1.1897675798547742, + "learning_rate": 1.2628416509410947e-05, + "loss": 0.6204, + "step": 5580 + }, + { + "epoch": 0.43, + "grad_norm": 1.1945103465734335, + "learning_rate": 1.2625992103448077e-05, + "loss": 0.584, + "step": 5581 + }, + { + "epoch": 0.43, + "grad_norm": 1.129620547177039, + "learning_rate": 1.2623567531692774e-05, + "loss": 0.5422, + "step": 5582 + }, + { + "epoch": 0.43, + "grad_norm": 1.106649139296138, + "learning_rate": 1.2621142794298118e-05, + "loss": 0.5495, + "step": 5583 + }, + { + "epoch": 0.43, + "grad_norm": 1.2286843091571198, + "learning_rate": 1.2618717891417194e-05, + "loss": 0.5841, + "step": 5584 + }, + { + "epoch": 0.43, + "grad_norm": 1.2267494302192081, + "learning_rate": 1.2616292823203098e-05, + "loss": 0.5958, + "step": 5585 + }, + { + "epoch": 0.43, + "grad_norm": 1.1938057172464853, + "learning_rate": 1.2613867589808939e-05, + "loss": 0.5393, + "step": 5586 + }, + { + "epoch": 0.43, + "grad_norm": 1.253108260882443, + "learning_rate": 1.2611442191387836e-05, + "loss": 0.5691, + "step": 5587 + }, + { + "epoch": 0.43, + "grad_norm": 1.2112076488763577, + "learning_rate": 1.2609016628092907e-05, + "loss": 0.5426, + "step": 5588 + }, + { + "epoch": 0.43, + "grad_norm": 1.1524016802176051, + "learning_rate": 1.2606590900077303e-05, + "loss": 0.5564, + "step": 5589 + }, + { + "epoch": 0.43, + "grad_norm": 1.1014883679274898, + "learning_rate": 1.2604165007494164e-05, + "loss": 0.5093, + "step": 5590 + }, + { + "epoch": 0.43, + "grad_norm": 1.9059208445048783, + "learning_rate": 1.2601738950496654e-05, + "loss": 0.5726, + "step": 5591 + }, + { + "epoch": 0.43, + "grad_norm": 1.0752109453691523, + "learning_rate": 1.2599312729237943e-05, + "loss": 0.5657, + "step": 5592 + }, + { + "epoch": 0.43, + "grad_norm": 1.1366389925003257, + "learning_rate": 1.2596886343871204e-05, + "loss": 0.5124, + "step": 5593 + }, + { + "epoch": 0.43, + "grad_norm": 1.1415359830661342, + "learning_rate": 1.2594459794549636e-05, + "loss": 0.5544, + "step": 5594 + }, + { + "epoch": 0.43, + "grad_norm": 1.2531306640040374, + "learning_rate": 1.2592033081426434e-05, + "loss": 0.5962, + "step": 5595 + }, + { + "epoch": 0.43, + "grad_norm": 1.0777766452752768, + "learning_rate": 1.2589606204654809e-05, + "loss": 0.5125, + "step": 5596 + }, + { + "epoch": 0.43, + "grad_norm": 1.1206248828145364, + "learning_rate": 1.2587179164387987e-05, + "loss": 0.5679, + "step": 5597 + }, + { + "epoch": 0.43, + "grad_norm": 1.157364849085201, + "learning_rate": 1.2584751960779192e-05, + "loss": 0.5925, + "step": 5598 + }, + { + "epoch": 0.43, + "grad_norm": 1.1152819407886694, + "learning_rate": 1.2582324593981673e-05, + "loss": 0.5686, + "step": 5599 + }, + { + "epoch": 0.43, + "grad_norm": 1.298006047109851, + "learning_rate": 1.2579897064148678e-05, + "loss": 0.5408, + "step": 5600 + }, + { + "epoch": 0.43, + "grad_norm": 1.162330619203509, + "learning_rate": 1.2577469371433473e-05, + "loss": 0.5657, + "step": 5601 + }, + { + "epoch": 0.43, + "grad_norm": 1.1326334318498235, + "learning_rate": 1.2575041515989328e-05, + "loss": 0.5385, + "step": 5602 + }, + { + "epoch": 0.43, + "grad_norm": 1.2027095721500707, + "learning_rate": 1.2572613497969524e-05, + "loss": 0.5648, + "step": 5603 + }, + { + "epoch": 0.43, + "grad_norm": 1.2390831598048317, + "learning_rate": 1.257018531752736e-05, + "loss": 0.5448, + "step": 5604 + }, + { + "epoch": 0.43, + "grad_norm": 1.2188423806415503, + "learning_rate": 1.2567756974816134e-05, + "loss": 0.5245, + "step": 5605 + }, + { + "epoch": 0.43, + "grad_norm": 1.0975876104539766, + "learning_rate": 1.2565328469989165e-05, + "loss": 0.5449, + "step": 5606 + }, + { + "epoch": 0.43, + "grad_norm": 1.1708930924574363, + "learning_rate": 1.2562899803199773e-05, + "loss": 0.626, + "step": 5607 + }, + { + "epoch": 0.44, + "grad_norm": 1.2815041638863705, + "learning_rate": 1.2560470974601294e-05, + "loss": 0.6245, + "step": 5608 + }, + { + "epoch": 0.44, + "grad_norm": 1.2052114156399512, + "learning_rate": 1.255804198434707e-05, + "loss": 0.5591, + "step": 5609 + }, + { + "epoch": 0.44, + "grad_norm": 1.1299411029995607, + "learning_rate": 1.2555612832590458e-05, + "loss": 0.5619, + "step": 5610 + }, + { + "epoch": 0.44, + "grad_norm": 1.3317409881521767, + "learning_rate": 1.2553183519484826e-05, + "loss": 0.6291, + "step": 5611 + }, + { + "epoch": 0.44, + "grad_norm": 1.0300141645587158, + "learning_rate": 1.2550754045183544e-05, + "loss": 0.5636, + "step": 5612 + }, + { + "epoch": 0.44, + "grad_norm": 1.0895233230160215, + "learning_rate": 1.2548324409839999e-05, + "loss": 0.5437, + "step": 5613 + }, + { + "epoch": 0.44, + "grad_norm": 1.2064299829641423, + "learning_rate": 1.2545894613607585e-05, + "loss": 0.6266, + "step": 5614 + }, + { + "epoch": 0.44, + "grad_norm": 1.2018781864879817, + "learning_rate": 1.254346465663971e-05, + "loss": 0.6192, + "step": 5615 + }, + { + "epoch": 0.44, + "grad_norm": 1.288902272756921, + "learning_rate": 1.254103453908979e-05, + "loss": 0.6152, + "step": 5616 + }, + { + "epoch": 0.44, + "grad_norm": 1.189199335994593, + "learning_rate": 1.2538604261111247e-05, + "loss": 0.6015, + "step": 5617 + }, + { + "epoch": 0.44, + "grad_norm": 1.0927091823364878, + "learning_rate": 1.253617382285752e-05, + "loss": 0.5194, + "step": 5618 + }, + { + "epoch": 0.44, + "grad_norm": 1.1990401541959328, + "learning_rate": 1.2533743224482055e-05, + "loss": 0.5738, + "step": 5619 + }, + { + "epoch": 0.44, + "grad_norm": 1.2292325546364247, + "learning_rate": 1.253131246613831e-05, + "loss": 0.5667, + "step": 5620 + }, + { + "epoch": 0.44, + "grad_norm": 1.1838606731718777, + "learning_rate": 1.2528881547979748e-05, + "loss": 0.5645, + "step": 5621 + }, + { + "epoch": 0.44, + "grad_norm": 1.133026471154515, + "learning_rate": 1.2526450470159845e-05, + "loss": 0.5647, + "step": 5622 + }, + { + "epoch": 0.44, + "grad_norm": 1.0842670244002264, + "learning_rate": 1.2524019232832089e-05, + "loss": 0.5356, + "step": 5623 + }, + { + "epoch": 0.44, + "grad_norm": 1.1829979533212456, + "learning_rate": 1.252158783614998e-05, + "loss": 0.6011, + "step": 5624 + }, + { + "epoch": 0.44, + "grad_norm": 1.2348087852342216, + "learning_rate": 1.2519156280267017e-05, + "loss": 0.602, + "step": 5625 + }, + { + "epoch": 0.44, + "grad_norm": 1.1313194169301475, + "learning_rate": 1.2516724565336724e-05, + "loss": 0.5417, + "step": 5626 + }, + { + "epoch": 0.44, + "grad_norm": 1.2771990399531177, + "learning_rate": 1.2514292691512624e-05, + "loss": 0.634, + "step": 5627 + }, + { + "epoch": 0.44, + "grad_norm": 1.2768444053506123, + "learning_rate": 1.2511860658948252e-05, + "loss": 0.5424, + "step": 5628 + }, + { + "epoch": 0.44, + "grad_norm": 1.0379950985140463, + "learning_rate": 1.250942846779716e-05, + "loss": 0.5099, + "step": 5629 + }, + { + "epoch": 0.44, + "grad_norm": 1.1584306721760087, + "learning_rate": 1.2506996118212897e-05, + "loss": 0.5554, + "step": 5630 + }, + { + "epoch": 0.44, + "grad_norm": 1.2251718011391377, + "learning_rate": 1.250456361034904e-05, + "loss": 0.6173, + "step": 5631 + }, + { + "epoch": 0.44, + "grad_norm": 1.2020889379061062, + "learning_rate": 1.2502130944359161e-05, + "loss": 0.5743, + "step": 5632 + }, + { + "epoch": 0.44, + "grad_norm": 1.221823460499267, + "learning_rate": 1.249969812039684e-05, + "loss": 0.5637, + "step": 5633 + }, + { + "epoch": 0.44, + "grad_norm": 1.079684139086382, + "learning_rate": 1.2497265138615686e-05, + "loss": 0.538, + "step": 5634 + }, + { + "epoch": 0.44, + "grad_norm": 1.2561091386763716, + "learning_rate": 1.2494831999169296e-05, + "loss": 0.5183, + "step": 5635 + }, + { + "epoch": 0.44, + "grad_norm": 1.2243682283191835, + "learning_rate": 1.2492398702211293e-05, + "loss": 0.5708, + "step": 5636 + }, + { + "epoch": 0.44, + "grad_norm": 1.1549289476742823, + "learning_rate": 1.2489965247895302e-05, + "loss": 0.5946, + "step": 5637 + }, + { + "epoch": 0.44, + "grad_norm": 1.0833222987762312, + "learning_rate": 1.2487531636374954e-05, + "loss": 0.5671, + "step": 5638 + }, + { + "epoch": 0.44, + "grad_norm": 1.261160712356664, + "learning_rate": 1.24850978678039e-05, + "loss": 0.5592, + "step": 5639 + }, + { + "epoch": 0.44, + "grad_norm": 1.2041147792861158, + "learning_rate": 1.2482663942335798e-05, + "loss": 0.5817, + "step": 5640 + }, + { + "epoch": 0.44, + "grad_norm": 1.0903270019196591, + "learning_rate": 1.2480229860124313e-05, + "loss": 0.5514, + "step": 5641 + }, + { + "epoch": 0.44, + "grad_norm": 1.2564350428399476, + "learning_rate": 1.2477795621323121e-05, + "loss": 0.5771, + "step": 5642 + }, + { + "epoch": 0.44, + "grad_norm": 1.164034849516926, + "learning_rate": 1.2475361226085907e-05, + "loss": 0.5226, + "step": 5643 + }, + { + "epoch": 0.44, + "grad_norm": 1.2202677934690032, + "learning_rate": 1.2472926674566366e-05, + "loss": 0.5313, + "step": 5644 + }, + { + "epoch": 0.44, + "grad_norm": 1.1276619465438138, + "learning_rate": 1.2470491966918205e-05, + "loss": 0.5251, + "step": 5645 + }, + { + "epoch": 0.44, + "grad_norm": 1.206963299563885, + "learning_rate": 1.2468057103295144e-05, + "loss": 0.5663, + "step": 5646 + }, + { + "epoch": 0.44, + "grad_norm": 1.1818964405424834, + "learning_rate": 1.2465622083850903e-05, + "loss": 0.5999, + "step": 5647 + }, + { + "epoch": 0.44, + "grad_norm": 1.3265614450043886, + "learning_rate": 1.2463186908739217e-05, + "loss": 0.593, + "step": 5648 + }, + { + "epoch": 0.44, + "grad_norm": 1.1676976836663495, + "learning_rate": 1.2460751578113832e-05, + "loss": 0.5849, + "step": 5649 + }, + { + "epoch": 0.44, + "grad_norm": 1.1147956586488383, + "learning_rate": 1.2458316092128509e-05, + "loss": 0.5933, + "step": 5650 + }, + { + "epoch": 0.44, + "grad_norm": 1.1741671715205786, + "learning_rate": 1.2455880450937006e-05, + "loss": 0.5776, + "step": 5651 + }, + { + "epoch": 0.44, + "grad_norm": 1.1617512640029863, + "learning_rate": 1.2453444654693099e-05, + "loss": 0.5895, + "step": 5652 + }, + { + "epoch": 0.44, + "grad_norm": 1.1563366522409184, + "learning_rate": 1.245100870355057e-05, + "loss": 0.5467, + "step": 5653 + }, + { + "epoch": 0.44, + "grad_norm": 1.1359980025307461, + "learning_rate": 1.2448572597663218e-05, + "loss": 0.5751, + "step": 5654 + }, + { + "epoch": 0.44, + "grad_norm": 1.0829127057541117, + "learning_rate": 1.2446136337184847e-05, + "loss": 0.5796, + "step": 5655 + }, + { + "epoch": 0.44, + "grad_norm": 1.1313118828133837, + "learning_rate": 1.2443699922269268e-05, + "loss": 0.5241, + "step": 5656 + }, + { + "epoch": 0.44, + "grad_norm": 1.2017290513347676, + "learning_rate": 1.2441263353070305e-05, + "loss": 0.6428, + "step": 5657 + }, + { + "epoch": 0.44, + "grad_norm": 1.2297537527692084, + "learning_rate": 1.243882662974179e-05, + "loss": 0.5594, + "step": 5658 + }, + { + "epoch": 0.44, + "grad_norm": 1.2139172716311857, + "learning_rate": 1.2436389752437565e-05, + "loss": 0.5995, + "step": 5659 + }, + { + "epoch": 0.44, + "grad_norm": 1.1963637970916101, + "learning_rate": 1.243395272131149e-05, + "loss": 0.5927, + "step": 5660 + }, + { + "epoch": 0.44, + "grad_norm": 1.1619556998575276, + "learning_rate": 1.243151553651742e-05, + "loss": 0.5908, + "step": 5661 + }, + { + "epoch": 0.44, + "grad_norm": 1.136419512502045, + "learning_rate": 1.2429078198209227e-05, + "loss": 0.6033, + "step": 5662 + }, + { + "epoch": 0.44, + "grad_norm": 1.2015794512243771, + "learning_rate": 1.2426640706540796e-05, + "loss": 0.6205, + "step": 5663 + }, + { + "epoch": 0.44, + "grad_norm": 1.2196878101799054, + "learning_rate": 1.2424203061666018e-05, + "loss": 0.6196, + "step": 5664 + }, + { + "epoch": 0.44, + "grad_norm": 1.1001492897510252, + "learning_rate": 1.2421765263738795e-05, + "loss": 0.5458, + "step": 5665 + }, + { + "epoch": 0.44, + "grad_norm": 1.1485767409842995, + "learning_rate": 1.2419327312913034e-05, + "loss": 0.5601, + "step": 5666 + }, + { + "epoch": 0.44, + "grad_norm": 1.2697973353963758, + "learning_rate": 1.2416889209342658e-05, + "loss": 0.6374, + "step": 5667 + }, + { + "epoch": 0.44, + "grad_norm": 1.1694339505316522, + "learning_rate": 1.2414450953181598e-05, + "loss": 0.5717, + "step": 5668 + }, + { + "epoch": 0.44, + "grad_norm": 1.2132129578735729, + "learning_rate": 1.2412012544583791e-05, + "loss": 0.5539, + "step": 5669 + }, + { + "epoch": 0.44, + "grad_norm": 1.1065480924377782, + "learning_rate": 1.2409573983703189e-05, + "loss": 0.5327, + "step": 5670 + }, + { + "epoch": 0.44, + "grad_norm": 1.2059210452271647, + "learning_rate": 1.2407135270693748e-05, + "loss": 0.5784, + "step": 5671 + }, + { + "epoch": 0.44, + "grad_norm": 1.1985990930109207, + "learning_rate": 1.240469640570944e-05, + "loss": 0.5868, + "step": 5672 + }, + { + "epoch": 0.44, + "grad_norm": 1.2559529650804542, + "learning_rate": 1.240225738890424e-05, + "loss": 0.6066, + "step": 5673 + }, + { + "epoch": 0.44, + "grad_norm": 1.0678661819017123, + "learning_rate": 1.2399818220432136e-05, + "loss": 0.5225, + "step": 5674 + }, + { + "epoch": 0.44, + "grad_norm": 1.1958938880147316, + "learning_rate": 1.239737890044713e-05, + "loss": 0.5535, + "step": 5675 + }, + { + "epoch": 0.44, + "grad_norm": 1.1642319216742743, + "learning_rate": 1.2394939429103224e-05, + "loss": 0.5682, + "step": 5676 + }, + { + "epoch": 0.44, + "grad_norm": 1.1415536314053027, + "learning_rate": 1.2392499806554433e-05, + "loss": 0.5845, + "step": 5677 + }, + { + "epoch": 0.44, + "grad_norm": 1.1686232146174917, + "learning_rate": 1.2390060032954787e-05, + "loss": 0.5718, + "step": 5678 + }, + { + "epoch": 0.44, + "grad_norm": 1.214916950731231, + "learning_rate": 1.2387620108458318e-05, + "loss": 0.5272, + "step": 5679 + }, + { + "epoch": 0.44, + "grad_norm": 1.1916406119550509, + "learning_rate": 1.2385180033219077e-05, + "loss": 0.5934, + "step": 5680 + }, + { + "epoch": 0.44, + "grad_norm": 1.1582562850923244, + "learning_rate": 1.2382739807391113e-05, + "loss": 0.5916, + "step": 5681 + }, + { + "epoch": 0.44, + "grad_norm": 1.1741218389848078, + "learning_rate": 1.238029943112849e-05, + "loss": 0.51, + "step": 5682 + }, + { + "epoch": 0.44, + "grad_norm": 0.9802493575642978, + "learning_rate": 1.2377858904585284e-05, + "loss": 0.5269, + "step": 5683 + }, + { + "epoch": 0.44, + "grad_norm": 1.252618336217392, + "learning_rate": 1.237541822791558e-05, + "loss": 0.5678, + "step": 5684 + }, + { + "epoch": 0.44, + "grad_norm": 1.0191928111788477, + "learning_rate": 1.2372977401273465e-05, + "loss": 0.535, + "step": 5685 + }, + { + "epoch": 0.44, + "grad_norm": 1.1349284437704643, + "learning_rate": 1.2370536424813044e-05, + "loss": 0.5501, + "step": 5686 + }, + { + "epoch": 0.44, + "grad_norm": 1.1382052739851187, + "learning_rate": 1.2368095298688428e-05, + "loss": 0.535, + "step": 5687 + }, + { + "epoch": 0.44, + "grad_norm": 1.134629259434978, + "learning_rate": 1.236565402305374e-05, + "loss": 0.5907, + "step": 5688 + }, + { + "epoch": 0.44, + "grad_norm": 1.217449668298233, + "learning_rate": 1.2363212598063103e-05, + "loss": 0.6254, + "step": 5689 + }, + { + "epoch": 0.44, + "grad_norm": 1.2451542388788717, + "learning_rate": 1.2360771023870668e-05, + "loss": 0.6002, + "step": 5690 + }, + { + "epoch": 0.44, + "grad_norm": 1.2742246289705255, + "learning_rate": 1.2358329300630576e-05, + "loss": 0.6187, + "step": 5691 + }, + { + "epoch": 0.44, + "grad_norm": 1.1497961298975805, + "learning_rate": 1.2355887428496986e-05, + "loss": 0.5844, + "step": 5692 + }, + { + "epoch": 0.44, + "grad_norm": 1.109832991502645, + "learning_rate": 1.2353445407624071e-05, + "loss": 0.5787, + "step": 5693 + }, + { + "epoch": 0.44, + "grad_norm": 1.0901601462002743, + "learning_rate": 1.2351003238166004e-05, + "loss": 0.5423, + "step": 5694 + }, + { + "epoch": 0.44, + "grad_norm": 1.1664810146482412, + "learning_rate": 1.2348560920276973e-05, + "loss": 0.5297, + "step": 5695 + }, + { + "epoch": 0.44, + "grad_norm": 1.1852418107975102, + "learning_rate": 1.2346118454111176e-05, + "loss": 0.5865, + "step": 5696 + }, + { + "epoch": 0.44, + "grad_norm": 1.2308784892572304, + "learning_rate": 1.2343675839822813e-05, + "loss": 0.5934, + "step": 5697 + }, + { + "epoch": 0.44, + "grad_norm": 1.1731898751304157, + "learning_rate": 1.2341233077566104e-05, + "loss": 0.5783, + "step": 5698 + }, + { + "epoch": 0.44, + "grad_norm": 1.158076416083538, + "learning_rate": 1.2338790167495272e-05, + "loss": 0.5773, + "step": 5699 + }, + { + "epoch": 0.44, + "grad_norm": 1.0918072476516998, + "learning_rate": 1.2336347109764551e-05, + "loss": 0.5666, + "step": 5700 + }, + { + "epoch": 0.44, + "grad_norm": 1.172473907017691, + "learning_rate": 1.2333903904528182e-05, + "loss": 0.5694, + "step": 5701 + }, + { + "epoch": 0.44, + "grad_norm": 1.1775588985056196, + "learning_rate": 1.2331460551940417e-05, + "loss": 0.5915, + "step": 5702 + }, + { + "epoch": 0.44, + "grad_norm": 0.9588980703889227, + "learning_rate": 1.232901705215552e-05, + "loss": 0.5136, + "step": 5703 + }, + { + "epoch": 0.44, + "grad_norm": 1.2558507844859523, + "learning_rate": 1.232657340532776e-05, + "loss": 0.599, + "step": 5704 + }, + { + "epoch": 0.44, + "grad_norm": 1.279749852711989, + "learning_rate": 1.2324129611611417e-05, + "loss": 0.6208, + "step": 5705 + }, + { + "epoch": 0.44, + "grad_norm": 1.218997245553889, + "learning_rate": 1.2321685671160784e-05, + "loss": 0.5686, + "step": 5706 + }, + { + "epoch": 0.44, + "grad_norm": 1.0930544140002052, + "learning_rate": 1.231924158413015e-05, + "loss": 0.5025, + "step": 5707 + }, + { + "epoch": 0.44, + "grad_norm": 1.2123553927207038, + "learning_rate": 1.2316797350673834e-05, + "loss": 0.5576, + "step": 5708 + }, + { + "epoch": 0.44, + "grad_norm": 1.299455534185615, + "learning_rate": 1.2314352970946146e-05, + "loss": 0.5765, + "step": 5709 + }, + { + "epoch": 0.44, + "grad_norm": 1.1586965503731155, + "learning_rate": 1.2311908445101414e-05, + "loss": 0.6, + "step": 5710 + }, + { + "epoch": 0.44, + "grad_norm": 1.284343845332993, + "learning_rate": 1.2309463773293977e-05, + "loss": 0.6017, + "step": 5711 + }, + { + "epoch": 0.44, + "grad_norm": 1.0505608536689617, + "learning_rate": 1.2307018955678174e-05, + "loss": 0.5418, + "step": 5712 + }, + { + "epoch": 0.44, + "grad_norm": 1.1649264003769886, + "learning_rate": 1.2304573992408363e-05, + "loss": 0.5433, + "step": 5713 + }, + { + "epoch": 0.44, + "grad_norm": 1.1021096784667377, + "learning_rate": 1.2302128883638904e-05, + "loss": 0.5567, + "step": 5714 + }, + { + "epoch": 0.44, + "grad_norm": 1.1644419621262367, + "learning_rate": 1.2299683629524175e-05, + "loss": 0.5814, + "step": 5715 + }, + { + "epoch": 0.44, + "grad_norm": 1.3517347457586624, + "learning_rate": 1.2297238230218551e-05, + "loss": 0.6268, + "step": 5716 + }, + { + "epoch": 0.44, + "grad_norm": 1.2117459951420755, + "learning_rate": 1.2294792685876424e-05, + "loss": 0.5515, + "step": 5717 + }, + { + "epoch": 0.44, + "grad_norm": 1.1427577733368208, + "learning_rate": 1.2292346996652198e-05, + "loss": 0.5639, + "step": 5718 + }, + { + "epoch": 0.44, + "grad_norm": 1.1971142223518676, + "learning_rate": 1.2289901162700276e-05, + "loss": 0.5631, + "step": 5719 + }, + { + "epoch": 0.44, + "grad_norm": 1.2168086802979947, + "learning_rate": 1.228745518417508e-05, + "loss": 0.622, + "step": 5720 + }, + { + "epoch": 0.44, + "grad_norm": 1.2120541254176294, + "learning_rate": 1.228500906123104e-05, + "loss": 0.5595, + "step": 5721 + }, + { + "epoch": 0.44, + "grad_norm": 1.2924462498716816, + "learning_rate": 1.2282562794022586e-05, + "loss": 0.6143, + "step": 5722 + }, + { + "epoch": 0.44, + "grad_norm": 1.2596763401772133, + "learning_rate": 1.2280116382704166e-05, + "loss": 0.5961, + "step": 5723 + }, + { + "epoch": 0.44, + "grad_norm": 1.3843915296820637, + "learning_rate": 1.2277669827430234e-05, + "loss": 0.6031, + "step": 5724 + }, + { + "epoch": 0.44, + "grad_norm": 1.093212976770467, + "learning_rate": 1.2275223128355258e-05, + "loss": 0.5511, + "step": 5725 + }, + { + "epoch": 0.44, + "grad_norm": 1.1682795490631581, + "learning_rate": 1.2272776285633708e-05, + "loss": 0.5524, + "step": 5726 + }, + { + "epoch": 0.44, + "grad_norm": 1.2210305713166165, + "learning_rate": 1.2270329299420061e-05, + "loss": 0.6075, + "step": 5727 + }, + { + "epoch": 0.44, + "grad_norm": 1.4010380694072542, + "learning_rate": 1.2267882169868813e-05, + "loss": 0.5523, + "step": 5728 + }, + { + "epoch": 0.44, + "grad_norm": 1.1338893113350184, + "learning_rate": 1.2265434897134462e-05, + "loss": 0.5625, + "step": 5729 + }, + { + "epoch": 0.44, + "grad_norm": 1.1550434622080412, + "learning_rate": 1.2262987481371523e-05, + "loss": 0.5506, + "step": 5730 + }, + { + "epoch": 0.44, + "grad_norm": 1.0366448405524578, + "learning_rate": 1.2260539922734505e-05, + "loss": 0.5239, + "step": 5731 + }, + { + "epoch": 0.44, + "grad_norm": 1.378430119513949, + "learning_rate": 1.2258092221377938e-05, + "loss": 0.6496, + "step": 5732 + }, + { + "epoch": 0.44, + "grad_norm": 1.2256839302675868, + "learning_rate": 1.2255644377456357e-05, + "loss": 0.5612, + "step": 5733 + }, + { + "epoch": 0.44, + "grad_norm": 1.279751902020504, + "learning_rate": 1.2253196391124313e-05, + "loss": 0.6126, + "step": 5734 + }, + { + "epoch": 0.44, + "grad_norm": 1.1559703591846446, + "learning_rate": 1.2250748262536357e-05, + "loss": 0.5656, + "step": 5735 + }, + { + "epoch": 0.44, + "grad_norm": 1.1631600895200256, + "learning_rate": 1.2248299991847048e-05, + "loss": 0.4909, + "step": 5736 + }, + { + "epoch": 0.45, + "grad_norm": 1.1719034827267767, + "learning_rate": 1.2245851579210958e-05, + "loss": 0.5376, + "step": 5737 + }, + { + "epoch": 0.45, + "grad_norm": 1.1778271387111043, + "learning_rate": 1.2243403024782675e-05, + "loss": 0.5458, + "step": 5738 + }, + { + "epoch": 0.45, + "grad_norm": 1.2291002201719787, + "learning_rate": 1.2240954328716783e-05, + "loss": 0.5324, + "step": 5739 + }, + { + "epoch": 0.45, + "grad_norm": 1.1940587767857695, + "learning_rate": 1.2238505491167884e-05, + "loss": 0.5983, + "step": 5740 + }, + { + "epoch": 0.45, + "grad_norm": 1.1452531848655838, + "learning_rate": 1.2236056512290584e-05, + "loss": 0.5081, + "step": 5741 + }, + { + "epoch": 0.45, + "grad_norm": 1.1813157825217495, + "learning_rate": 1.2233607392239497e-05, + "loss": 0.5692, + "step": 5742 + }, + { + "epoch": 0.45, + "grad_norm": 1.171500082441743, + "learning_rate": 1.2231158131169251e-05, + "loss": 0.5278, + "step": 5743 + }, + { + "epoch": 0.45, + "grad_norm": 1.1124261617266489, + "learning_rate": 1.2228708729234487e-05, + "loss": 0.5186, + "step": 5744 + }, + { + "epoch": 0.45, + "grad_norm": 1.0741828496309291, + "learning_rate": 1.222625918658984e-05, + "loss": 0.5729, + "step": 5745 + }, + { + "epoch": 0.45, + "grad_norm": 1.3032568665891962, + "learning_rate": 1.2223809503389962e-05, + "loss": 0.5913, + "step": 5746 + }, + { + "epoch": 0.45, + "grad_norm": 1.172448285066508, + "learning_rate": 1.2221359679789518e-05, + "loss": 0.6391, + "step": 5747 + }, + { + "epoch": 0.45, + "grad_norm": 1.1912769184893888, + "learning_rate": 1.2218909715943174e-05, + "loss": 0.5524, + "step": 5748 + }, + { + "epoch": 0.45, + "grad_norm": 1.190448413479408, + "learning_rate": 1.2216459612005616e-05, + "loss": 0.5552, + "step": 5749 + }, + { + "epoch": 0.45, + "grad_norm": 1.1883309117560483, + "learning_rate": 1.2214009368131525e-05, + "loss": 0.5022, + "step": 5750 + }, + { + "epoch": 0.45, + "grad_norm": 1.273073905103042, + "learning_rate": 1.22115589844756e-05, + "loss": 0.5961, + "step": 5751 + }, + { + "epoch": 0.45, + "grad_norm": 1.3431829098678116, + "learning_rate": 1.2209108461192546e-05, + "loss": 0.5901, + "step": 5752 + }, + { + "epoch": 0.45, + "grad_norm": 1.135715736874924, + "learning_rate": 1.2206657798437078e-05, + "loss": 0.541, + "step": 5753 + }, + { + "epoch": 0.45, + "grad_norm": 1.1776193338008218, + "learning_rate": 1.2204206996363918e-05, + "loss": 0.5781, + "step": 5754 + }, + { + "epoch": 0.45, + "grad_norm": 1.2144509321299104, + "learning_rate": 1.2201756055127798e-05, + "loss": 0.5979, + "step": 5755 + }, + { + "epoch": 0.45, + "grad_norm": 1.0928416158426182, + "learning_rate": 1.219930497488346e-05, + "loss": 0.573, + "step": 5756 + }, + { + "epoch": 0.45, + "grad_norm": 1.1879574246050468, + "learning_rate": 1.2196853755785649e-05, + "loss": 0.5774, + "step": 5757 + }, + { + "epoch": 0.45, + "grad_norm": 1.2330869400639546, + "learning_rate": 1.2194402397989128e-05, + "loss": 0.5453, + "step": 5758 + }, + { + "epoch": 0.45, + "grad_norm": 1.1897570091885952, + "learning_rate": 1.2191950901648664e-05, + "loss": 0.527, + "step": 5759 + }, + { + "epoch": 0.45, + "grad_norm": 1.080463637245422, + "learning_rate": 1.2189499266919028e-05, + "loss": 0.5033, + "step": 5760 + }, + { + "epoch": 0.45, + "grad_norm": 1.1655886074245116, + "learning_rate": 1.2187047493955006e-05, + "loss": 0.5931, + "step": 5761 + }, + { + "epoch": 0.45, + "grad_norm": 1.174902197643059, + "learning_rate": 1.2184595582911394e-05, + "loss": 0.5324, + "step": 5762 + }, + { + "epoch": 0.45, + "grad_norm": 1.2115452379635694, + "learning_rate": 1.218214353394299e-05, + "loss": 0.5163, + "step": 5763 + }, + { + "epoch": 0.45, + "grad_norm": 1.1526727756237607, + "learning_rate": 1.2179691347204609e-05, + "loss": 0.4868, + "step": 5764 + }, + { + "epoch": 0.45, + "grad_norm": 1.2146941457791842, + "learning_rate": 1.2177239022851068e-05, + "loss": 0.5646, + "step": 5765 + }, + { + "epoch": 0.45, + "grad_norm": 1.1720135416153417, + "learning_rate": 1.2174786561037191e-05, + "loss": 0.5774, + "step": 5766 + }, + { + "epoch": 0.45, + "grad_norm": 1.2000503390403146, + "learning_rate": 1.2172333961917819e-05, + "loss": 0.5609, + "step": 5767 + }, + { + "epoch": 0.45, + "grad_norm": 1.1779116470253348, + "learning_rate": 1.2169881225647797e-05, + "loss": 0.5655, + "step": 5768 + }, + { + "epoch": 0.45, + "grad_norm": 1.1407609100890166, + "learning_rate": 1.2167428352381977e-05, + "loss": 0.5076, + "step": 5769 + }, + { + "epoch": 0.45, + "grad_norm": 1.0374528276680997, + "learning_rate": 1.2164975342275227e-05, + "loss": 0.514, + "step": 5770 + }, + { + "epoch": 0.45, + "grad_norm": 1.1877798955226615, + "learning_rate": 1.2162522195482408e-05, + "loss": 0.5438, + "step": 5771 + }, + { + "epoch": 0.45, + "grad_norm": 1.1487102573896208, + "learning_rate": 1.2160068912158408e-05, + "loss": 0.544, + "step": 5772 + }, + { + "epoch": 0.45, + "grad_norm": 1.2242168180624513, + "learning_rate": 1.2157615492458113e-05, + "loss": 0.5536, + "step": 5773 + }, + { + "epoch": 0.45, + "grad_norm": 1.101818082121585, + "learning_rate": 1.2155161936536422e-05, + "loss": 0.5893, + "step": 5774 + }, + { + "epoch": 0.45, + "grad_norm": 1.083124538009906, + "learning_rate": 1.2152708244548237e-05, + "loss": 0.5432, + "step": 5775 + }, + { + "epoch": 0.45, + "grad_norm": 1.1948722577151858, + "learning_rate": 1.2150254416648473e-05, + "loss": 0.541, + "step": 5776 + }, + { + "epoch": 0.45, + "grad_norm": 1.2256619980755752, + "learning_rate": 1.2147800452992055e-05, + "loss": 0.5762, + "step": 5777 + }, + { + "epoch": 0.45, + "grad_norm": 1.1837845953747075, + "learning_rate": 1.2145346353733913e-05, + "loss": 0.5393, + "step": 5778 + }, + { + "epoch": 0.45, + "grad_norm": 1.1906830140245028, + "learning_rate": 1.2142892119028988e-05, + "loss": 0.5614, + "step": 5779 + }, + { + "epoch": 0.45, + "grad_norm": 1.1461272065129666, + "learning_rate": 1.2140437749032227e-05, + "loss": 0.5559, + "step": 5780 + }, + { + "epoch": 0.45, + "grad_norm": 1.1644388396965029, + "learning_rate": 1.2137983243898585e-05, + "loss": 0.565, + "step": 5781 + }, + { + "epoch": 0.45, + "grad_norm": 1.2571805705259522, + "learning_rate": 1.2135528603783036e-05, + "loss": 0.5691, + "step": 5782 + }, + { + "epoch": 0.45, + "grad_norm": 1.0542582450763196, + "learning_rate": 1.2133073828840543e-05, + "loss": 0.5131, + "step": 5783 + }, + { + "epoch": 0.45, + "grad_norm": 1.1099808341596908, + "learning_rate": 1.2130618919226099e-05, + "loss": 0.5439, + "step": 5784 + }, + { + "epoch": 0.45, + "grad_norm": 1.2504267917635397, + "learning_rate": 1.2128163875094687e-05, + "loss": 0.614, + "step": 5785 + }, + { + "epoch": 0.45, + "grad_norm": 1.2536251430391578, + "learning_rate": 1.2125708696601309e-05, + "loss": 0.64, + "step": 5786 + }, + { + "epoch": 0.45, + "grad_norm": 1.2027608146871105, + "learning_rate": 1.2123253383900974e-05, + "loss": 0.579, + "step": 5787 + }, + { + "epoch": 0.45, + "grad_norm": 1.2219459005795537, + "learning_rate": 1.2120797937148699e-05, + "loss": 0.608, + "step": 5788 + }, + { + "epoch": 0.45, + "grad_norm": 1.1487539984101123, + "learning_rate": 1.2118342356499508e-05, + "loss": 0.5553, + "step": 5789 + }, + { + "epoch": 0.45, + "grad_norm": 1.0749243620943467, + "learning_rate": 1.2115886642108437e-05, + "loss": 0.5285, + "step": 5790 + }, + { + "epoch": 0.45, + "grad_norm": 1.0901008767511062, + "learning_rate": 1.2113430794130522e-05, + "loss": 0.5143, + "step": 5791 + }, + { + "epoch": 0.45, + "grad_norm": 1.109954843633605, + "learning_rate": 1.2110974812720819e-05, + "loss": 0.5127, + "step": 5792 + }, + { + "epoch": 0.45, + "grad_norm": 1.1113190608668968, + "learning_rate": 1.2108518698034384e-05, + "loss": 0.5305, + "step": 5793 + }, + { + "epoch": 0.45, + "grad_norm": 1.1599832527826188, + "learning_rate": 1.2106062450226287e-05, + "loss": 0.5953, + "step": 5794 + }, + { + "epoch": 0.45, + "grad_norm": 1.1504900489419883, + "learning_rate": 1.2103606069451601e-05, + "loss": 0.5835, + "step": 5795 + }, + { + "epoch": 0.45, + "grad_norm": 1.2313824848601616, + "learning_rate": 1.210114955586541e-05, + "loss": 0.6057, + "step": 5796 + }, + { + "epoch": 0.45, + "grad_norm": 1.2464898893831475, + "learning_rate": 1.2098692909622808e-05, + "loss": 0.5481, + "step": 5797 + }, + { + "epoch": 0.45, + "grad_norm": 1.2924023911974978, + "learning_rate": 1.2096236130878894e-05, + "loss": 0.5437, + "step": 5798 + }, + { + "epoch": 0.45, + "grad_norm": 1.1411081623293935, + "learning_rate": 1.2093779219788777e-05, + "loss": 0.5064, + "step": 5799 + }, + { + "epoch": 0.45, + "grad_norm": 1.062769631225352, + "learning_rate": 1.2091322176507579e-05, + "loss": 0.4804, + "step": 5800 + }, + { + "epoch": 0.45, + "grad_norm": 1.0799403180303564, + "learning_rate": 1.2088865001190418e-05, + "loss": 0.5354, + "step": 5801 + }, + { + "epoch": 0.45, + "grad_norm": 1.2463022374537462, + "learning_rate": 1.2086407693992434e-05, + "loss": 0.6065, + "step": 5802 + }, + { + "epoch": 0.45, + "grad_norm": 1.106121396278204, + "learning_rate": 1.2083950255068766e-05, + "loss": 0.5653, + "step": 5803 + }, + { + "epoch": 0.45, + "grad_norm": 1.1750484476855685, + "learning_rate": 1.208149268457457e-05, + "loss": 0.5781, + "step": 5804 + }, + { + "epoch": 0.45, + "grad_norm": 1.173305147430331, + "learning_rate": 1.2079034982665001e-05, + "loss": 0.5533, + "step": 5805 + }, + { + "epoch": 0.45, + "grad_norm": 1.1877700097147526, + "learning_rate": 1.2076577149495226e-05, + "loss": 0.575, + "step": 5806 + }, + { + "epoch": 0.45, + "grad_norm": 1.226780185698904, + "learning_rate": 1.207411918522042e-05, + "loss": 0.5951, + "step": 5807 + }, + { + "epoch": 0.45, + "grad_norm": 1.1669546464444174, + "learning_rate": 1.2071661089995772e-05, + "loss": 0.5916, + "step": 5808 + }, + { + "epoch": 0.45, + "grad_norm": 1.147030493981939, + "learning_rate": 1.2069202863976471e-05, + "loss": 0.5472, + "step": 5809 + }, + { + "epoch": 0.45, + "grad_norm": 1.1124389138756328, + "learning_rate": 1.2066744507317718e-05, + "loss": 0.566, + "step": 5810 + }, + { + "epoch": 0.45, + "grad_norm": 1.2951715151052539, + "learning_rate": 1.2064286020174718e-05, + "loss": 0.5823, + "step": 5811 + }, + { + "epoch": 0.45, + "grad_norm": 1.2010264238822317, + "learning_rate": 1.2061827402702691e-05, + "loss": 0.5236, + "step": 5812 + }, + { + "epoch": 0.45, + "grad_norm": 1.1301946448743838, + "learning_rate": 1.2059368655056864e-05, + "loss": 0.5331, + "step": 5813 + }, + { + "epoch": 0.45, + "grad_norm": 1.1961510403477786, + "learning_rate": 1.2056909777392471e-05, + "loss": 0.6108, + "step": 5814 + }, + { + "epoch": 0.45, + "grad_norm": 1.18671762894558, + "learning_rate": 1.2054450769864752e-05, + "loss": 0.5675, + "step": 5815 + }, + { + "epoch": 0.45, + "grad_norm": 1.1760007330249915, + "learning_rate": 1.2051991632628952e-05, + "loss": 0.5775, + "step": 5816 + }, + { + "epoch": 0.45, + "grad_norm": 1.16608463255676, + "learning_rate": 1.2049532365840333e-05, + "loss": 0.6161, + "step": 5817 + }, + { + "epoch": 0.45, + "grad_norm": 1.1621810249872406, + "learning_rate": 1.2047072969654165e-05, + "loss": 0.5382, + "step": 5818 + }, + { + "epoch": 0.45, + "grad_norm": 1.1802851192660717, + "learning_rate": 1.204461344422572e-05, + "loss": 0.5502, + "step": 5819 + }, + { + "epoch": 0.45, + "grad_norm": 1.1400551613629568, + "learning_rate": 1.2042153789710278e-05, + "loss": 0.5125, + "step": 5820 + }, + { + "epoch": 0.45, + "grad_norm": 1.1618702361946471, + "learning_rate": 1.2039694006263129e-05, + "loss": 0.5481, + "step": 5821 + }, + { + "epoch": 0.45, + "grad_norm": 1.1806254416081747, + "learning_rate": 1.2037234094039573e-05, + "loss": 0.5591, + "step": 5822 + }, + { + "epoch": 0.45, + "grad_norm": 1.2368320203564345, + "learning_rate": 1.2034774053194922e-05, + "loss": 0.5767, + "step": 5823 + }, + { + "epoch": 0.45, + "grad_norm": 1.1410431813820914, + "learning_rate": 1.2032313883884485e-05, + "loss": 0.5382, + "step": 5824 + }, + { + "epoch": 0.45, + "grad_norm": 1.222155678748634, + "learning_rate": 1.202985358626359e-05, + "loss": 0.5459, + "step": 5825 + }, + { + "epoch": 0.45, + "grad_norm": 1.202775978887758, + "learning_rate": 1.2027393160487561e-05, + "loss": 0.5589, + "step": 5826 + }, + { + "epoch": 0.45, + "grad_norm": 1.1627100826474233, + "learning_rate": 1.2024932606711741e-05, + "loss": 0.5983, + "step": 5827 + }, + { + "epoch": 0.45, + "grad_norm": 1.1437455432575716, + "learning_rate": 1.2022471925091483e-05, + "loss": 0.5652, + "step": 5828 + }, + { + "epoch": 0.45, + "grad_norm": 1.13861978977748, + "learning_rate": 1.2020011115782135e-05, + "loss": 0.5256, + "step": 5829 + }, + { + "epoch": 0.45, + "grad_norm": 1.0433482076317901, + "learning_rate": 1.2017550178939064e-05, + "loss": 0.5304, + "step": 5830 + }, + { + "epoch": 0.45, + "grad_norm": 1.1142188156485253, + "learning_rate": 1.2015089114717642e-05, + "loss": 0.5115, + "step": 5831 + }, + { + "epoch": 0.45, + "grad_norm": 1.1799556099090744, + "learning_rate": 1.2012627923273246e-05, + "loss": 0.5378, + "step": 5832 + }, + { + "epoch": 0.45, + "grad_norm": 1.2779304020486262, + "learning_rate": 1.2010166604761266e-05, + "loss": 0.5505, + "step": 5833 + }, + { + "epoch": 0.45, + "grad_norm": 1.2344693015087467, + "learning_rate": 1.20077051593371e-05, + "loss": 0.5774, + "step": 5834 + }, + { + "epoch": 0.45, + "grad_norm": 1.1387612780484628, + "learning_rate": 1.2005243587156143e-05, + "loss": 0.5335, + "step": 5835 + }, + { + "epoch": 0.45, + "grad_norm": 1.1495471186992472, + "learning_rate": 1.2002781888373818e-05, + "loss": 0.5972, + "step": 5836 + }, + { + "epoch": 0.45, + "grad_norm": 1.1760604374854247, + "learning_rate": 1.2000320063145536e-05, + "loss": 0.6117, + "step": 5837 + }, + { + "epoch": 0.45, + "grad_norm": 1.1046612489594707, + "learning_rate": 1.199785811162673e-05, + "loss": 0.5099, + "step": 5838 + }, + { + "epoch": 0.45, + "grad_norm": 1.1250570070869994, + "learning_rate": 1.1995396033972834e-05, + "loss": 0.5631, + "step": 5839 + }, + { + "epoch": 0.45, + "grad_norm": 1.110835765789218, + "learning_rate": 1.1992933830339288e-05, + "loss": 0.502, + "step": 5840 + }, + { + "epoch": 0.45, + "grad_norm": 1.1313335893634813, + "learning_rate": 1.1990471500881551e-05, + "loss": 0.593, + "step": 5841 + }, + { + "epoch": 0.45, + "grad_norm": 1.1774903609251823, + "learning_rate": 1.1988009045755077e-05, + "loss": 0.5985, + "step": 5842 + }, + { + "epoch": 0.45, + "grad_norm": 1.2243414042655396, + "learning_rate": 1.1985546465115336e-05, + "loss": 0.5613, + "step": 5843 + }, + { + "epoch": 0.45, + "grad_norm": 1.1738508733236663, + "learning_rate": 1.1983083759117804e-05, + "loss": 0.5925, + "step": 5844 + }, + { + "epoch": 0.45, + "grad_norm": 1.0072078813318732, + "learning_rate": 1.1980620927917958e-05, + "loss": 0.5195, + "step": 5845 + }, + { + "epoch": 0.45, + "grad_norm": 1.1427341452470745, + "learning_rate": 1.19781579716713e-05, + "loss": 0.5703, + "step": 5846 + }, + { + "epoch": 0.45, + "grad_norm": 1.3303544573880208, + "learning_rate": 1.1975694890533318e-05, + "loss": 0.5964, + "step": 5847 + }, + { + "epoch": 0.45, + "grad_norm": 1.2233942504097892, + "learning_rate": 1.1973231684659527e-05, + "loss": 0.6063, + "step": 5848 + }, + { + "epoch": 0.45, + "grad_norm": 1.2731503121842511, + "learning_rate": 1.1970768354205443e-05, + "loss": 0.5583, + "step": 5849 + }, + { + "epoch": 0.45, + "grad_norm": 1.0754297816765037, + "learning_rate": 1.1968304899326577e-05, + "loss": 0.5909, + "step": 5850 + }, + { + "epoch": 0.45, + "grad_norm": 1.1399600559899563, + "learning_rate": 1.1965841320178473e-05, + "loss": 0.5293, + "step": 5851 + }, + { + "epoch": 0.45, + "grad_norm": 1.2077981432299578, + "learning_rate": 1.1963377616916662e-05, + "loss": 0.5339, + "step": 5852 + }, + { + "epoch": 0.45, + "grad_norm": 1.2131592089208965, + "learning_rate": 1.1960913789696694e-05, + "loss": 0.5758, + "step": 5853 + }, + { + "epoch": 0.45, + "grad_norm": 1.215401769700864, + "learning_rate": 1.1958449838674122e-05, + "loss": 0.5389, + "step": 5854 + }, + { + "epoch": 0.45, + "grad_norm": 1.1799737444074103, + "learning_rate": 1.1955985764004504e-05, + "loss": 0.5464, + "step": 5855 + }, + { + "epoch": 0.45, + "grad_norm": 1.2863515843054454, + "learning_rate": 1.1953521565843415e-05, + "loss": 0.6042, + "step": 5856 + }, + { + "epoch": 0.45, + "grad_norm": 1.204111561729822, + "learning_rate": 1.195105724434643e-05, + "loss": 0.5337, + "step": 5857 + }, + { + "epoch": 0.45, + "grad_norm": 1.1542259233230394, + "learning_rate": 1.1948592799669136e-05, + "loss": 0.5513, + "step": 5858 + }, + { + "epoch": 0.45, + "grad_norm": 1.0744879367980187, + "learning_rate": 1.1946128231967127e-05, + "loss": 0.5253, + "step": 5859 + }, + { + "epoch": 0.45, + "grad_norm": 1.0442689941592997, + "learning_rate": 1.1943663541395998e-05, + "loss": 0.5464, + "step": 5860 + }, + { + "epoch": 0.45, + "grad_norm": 1.1750569187744138, + "learning_rate": 1.1941198728111364e-05, + "loss": 0.571, + "step": 5861 + }, + { + "epoch": 0.45, + "grad_norm": 1.1717132456724335, + "learning_rate": 1.1938733792268837e-05, + "loss": 0.5556, + "step": 5862 + }, + { + "epoch": 0.45, + "grad_norm": 1.1851471630897263, + "learning_rate": 1.1936268734024048e-05, + "loss": 0.5668, + "step": 5863 + }, + { + "epoch": 0.45, + "grad_norm": 1.1747150339368848, + "learning_rate": 1.1933803553532622e-05, + "loss": 0.5544, + "step": 5864 + }, + { + "epoch": 0.45, + "grad_norm": 1.1452167007744813, + "learning_rate": 1.1931338250950197e-05, + "loss": 0.5183, + "step": 5865 + }, + { + "epoch": 0.46, + "grad_norm": 1.194493181259474, + "learning_rate": 1.1928872826432427e-05, + "loss": 0.5388, + "step": 5866 + }, + { + "epoch": 0.46, + "grad_norm": 1.1237559327508897, + "learning_rate": 1.1926407280134962e-05, + "loss": 0.5561, + "step": 5867 + }, + { + "epoch": 0.46, + "grad_norm": 1.3663842593692355, + "learning_rate": 1.1923941612213468e-05, + "loss": 0.5684, + "step": 5868 + }, + { + "epoch": 0.46, + "grad_norm": 1.2856317817238363, + "learning_rate": 1.1921475822823613e-05, + "loss": 0.6646, + "step": 5869 + }, + { + "epoch": 0.46, + "grad_norm": 1.0807219478258843, + "learning_rate": 1.1919009912121075e-05, + "loss": 0.5468, + "step": 5870 + }, + { + "epoch": 0.46, + "grad_norm": 1.1001585542676102, + "learning_rate": 1.1916543880261541e-05, + "loss": 0.5462, + "step": 5871 + }, + { + "epoch": 0.46, + "grad_norm": 1.210953103241838, + "learning_rate": 1.1914077727400706e-05, + "loss": 0.5531, + "step": 5872 + }, + { + "epoch": 0.46, + "grad_norm": 1.1054588290466927, + "learning_rate": 1.1911611453694267e-05, + "loss": 0.5374, + "step": 5873 + }, + { + "epoch": 0.46, + "grad_norm": 1.14059197364965, + "learning_rate": 1.1909145059297935e-05, + "loss": 0.527, + "step": 5874 + }, + { + "epoch": 0.46, + "grad_norm": 1.1699562131399075, + "learning_rate": 1.1906678544367423e-05, + "loss": 0.5134, + "step": 5875 + }, + { + "epoch": 0.46, + "grad_norm": 1.2421120194828592, + "learning_rate": 1.190421190905846e-05, + "loss": 0.5647, + "step": 5876 + }, + { + "epoch": 0.46, + "grad_norm": 1.2952461124428827, + "learning_rate": 1.1901745153526773e-05, + "loss": 0.5676, + "step": 5877 + }, + { + "epoch": 0.46, + "grad_norm": 1.2217136442373169, + "learning_rate": 1.1899278277928103e-05, + "loss": 0.594, + "step": 5878 + }, + { + "epoch": 0.46, + "grad_norm": 1.1674616296413667, + "learning_rate": 1.1896811282418199e-05, + "loss": 0.5698, + "step": 5879 + }, + { + "epoch": 0.46, + "grad_norm": 1.213094598938523, + "learning_rate": 1.1894344167152809e-05, + "loss": 0.6098, + "step": 5880 + }, + { + "epoch": 0.46, + "grad_norm": 1.238448026479051, + "learning_rate": 1.1891876932287701e-05, + "loss": 0.6178, + "step": 5881 + }, + { + "epoch": 0.46, + "grad_norm": 1.2836003494325223, + "learning_rate": 1.1889409577978639e-05, + "loss": 0.6002, + "step": 5882 + }, + { + "epoch": 0.46, + "grad_norm": 1.1942288841247801, + "learning_rate": 1.1886942104381403e-05, + "loss": 0.5484, + "step": 5883 + }, + { + "epoch": 0.46, + "grad_norm": 1.3050263929803976, + "learning_rate": 1.1884474511651778e-05, + "loss": 0.5789, + "step": 5884 + }, + { + "epoch": 0.46, + "grad_norm": 1.208662739062639, + "learning_rate": 1.1882006799945551e-05, + "loss": 0.6153, + "step": 5885 + }, + { + "epoch": 0.46, + "grad_norm": 1.0800346379553527, + "learning_rate": 1.1879538969418526e-05, + "loss": 0.568, + "step": 5886 + }, + { + "epoch": 0.46, + "grad_norm": 1.1419719424801138, + "learning_rate": 1.1877071020226512e-05, + "loss": 0.5483, + "step": 5887 + }, + { + "epoch": 0.46, + "grad_norm": 1.2478569733996008, + "learning_rate": 1.1874602952525317e-05, + "loss": 0.5996, + "step": 5888 + }, + { + "epoch": 0.46, + "grad_norm": 1.3204928844856738, + "learning_rate": 1.1872134766470769e-05, + "loss": 0.6373, + "step": 5889 + }, + { + "epoch": 0.46, + "grad_norm": 1.1594939219721387, + "learning_rate": 1.1869666462218693e-05, + "loss": 0.5458, + "step": 5890 + }, + { + "epoch": 0.46, + "grad_norm": 1.0793814593874516, + "learning_rate": 1.1867198039924923e-05, + "loss": 0.516, + "step": 5891 + }, + { + "epoch": 0.46, + "grad_norm": 1.1567645088079015, + "learning_rate": 1.1864729499745312e-05, + "loss": 0.5673, + "step": 5892 + }, + { + "epoch": 0.46, + "grad_norm": 1.0312249729703138, + "learning_rate": 1.1862260841835706e-05, + "loss": 0.5082, + "step": 5893 + }, + { + "epoch": 0.46, + "grad_norm": 1.086463690964292, + "learning_rate": 1.1859792066351964e-05, + "loss": 0.5152, + "step": 5894 + }, + { + "epoch": 0.46, + "grad_norm": 1.121048344878929, + "learning_rate": 1.1857323173449956e-05, + "loss": 0.5682, + "step": 5895 + }, + { + "epoch": 0.46, + "grad_norm": 1.3202165196310283, + "learning_rate": 1.1854854163285548e-05, + "loss": 0.6055, + "step": 5896 + }, + { + "epoch": 0.46, + "grad_norm": 1.186041136624872, + "learning_rate": 1.185238503601463e-05, + "loss": 0.5791, + "step": 5897 + }, + { + "epoch": 0.46, + "grad_norm": 1.22254216938579, + "learning_rate": 1.1849915791793091e-05, + "loss": 0.5845, + "step": 5898 + }, + { + "epoch": 0.46, + "grad_norm": 1.1606132044113173, + "learning_rate": 1.1847446430776822e-05, + "loss": 0.5692, + "step": 5899 + }, + { + "epoch": 0.46, + "grad_norm": 1.1930835361379735, + "learning_rate": 1.1844976953121725e-05, + "loss": 0.5913, + "step": 5900 + }, + { + "epoch": 0.46, + "grad_norm": 1.1986993416592568, + "learning_rate": 1.1842507358983715e-05, + "loss": 0.6328, + "step": 5901 + }, + { + "epoch": 0.46, + "grad_norm": 1.1882834610677058, + "learning_rate": 1.1840037648518712e-05, + "loss": 0.5788, + "step": 5902 + }, + { + "epoch": 0.46, + "grad_norm": 1.1837200940875108, + "learning_rate": 1.1837567821882638e-05, + "loss": 0.5791, + "step": 5903 + }, + { + "epoch": 0.46, + "grad_norm": 1.1003249772070514, + "learning_rate": 1.1835097879231427e-05, + "loss": 0.5656, + "step": 5904 + }, + { + "epoch": 0.46, + "grad_norm": 1.2701610235574328, + "learning_rate": 1.1832627820721017e-05, + "loss": 0.5365, + "step": 5905 + }, + { + "epoch": 0.46, + "grad_norm": 1.143477804594884, + "learning_rate": 1.1830157646507358e-05, + "loss": 0.5089, + "step": 5906 + }, + { + "epoch": 0.46, + "grad_norm": 1.2677134484235162, + "learning_rate": 1.1827687356746406e-05, + "loss": 0.5629, + "step": 5907 + }, + { + "epoch": 0.46, + "grad_norm": 1.1531484089778088, + "learning_rate": 1.182521695159412e-05, + "loss": 0.539, + "step": 5908 + }, + { + "epoch": 0.46, + "grad_norm": 1.1058788280332412, + "learning_rate": 1.1822746431206473e-05, + "loss": 0.5695, + "step": 5909 + }, + { + "epoch": 0.46, + "grad_norm": 1.1808895025217803, + "learning_rate": 1.1820275795739438e-05, + "loss": 0.5691, + "step": 5910 + }, + { + "epoch": 0.46, + "grad_norm": 1.2212088307964457, + "learning_rate": 1.1817805045349e-05, + "loss": 0.6193, + "step": 5911 + }, + { + "epoch": 0.46, + "grad_norm": 1.1192837768393247, + "learning_rate": 1.1815334180191153e-05, + "loss": 0.5424, + "step": 5912 + }, + { + "epoch": 0.46, + "grad_norm": 1.2534083153145257, + "learning_rate": 1.1812863200421894e-05, + "loss": 0.5966, + "step": 5913 + }, + { + "epoch": 0.46, + "grad_norm": 1.2925274144701855, + "learning_rate": 1.1810392106197224e-05, + "loss": 0.6072, + "step": 5914 + }, + { + "epoch": 0.46, + "grad_norm": 1.1839332723082427, + "learning_rate": 1.1807920897673162e-05, + "loss": 0.55, + "step": 5915 + }, + { + "epoch": 0.46, + "grad_norm": 1.1519622688538522, + "learning_rate": 1.1805449575005726e-05, + "loss": 0.5678, + "step": 5916 + }, + { + "epoch": 0.46, + "grad_norm": 1.1629580179970105, + "learning_rate": 1.1802978138350945e-05, + "loss": 0.5458, + "step": 5917 + }, + { + "epoch": 0.46, + "grad_norm": 1.1714943839902012, + "learning_rate": 1.1800506587864851e-05, + "loss": 0.5418, + "step": 5918 + }, + { + "epoch": 0.46, + "grad_norm": 1.243544838628027, + "learning_rate": 1.1798034923703486e-05, + "loss": 0.6272, + "step": 5919 + }, + { + "epoch": 0.46, + "grad_norm": 1.1981541881876347, + "learning_rate": 1.17955631460229e-05, + "loss": 0.519, + "step": 5920 + }, + { + "epoch": 0.46, + "grad_norm": 1.1811447237927264, + "learning_rate": 1.1793091254979148e-05, + "loss": 0.5514, + "step": 5921 + }, + { + "epoch": 0.46, + "grad_norm": 1.1235853413593588, + "learning_rate": 1.1790619250728295e-05, + "loss": 0.5534, + "step": 5922 + }, + { + "epoch": 0.46, + "grad_norm": 1.1624878851966867, + "learning_rate": 1.178814713342641e-05, + "loss": 0.5502, + "step": 5923 + }, + { + "epoch": 0.46, + "grad_norm": 1.114220260000697, + "learning_rate": 1.1785674903229572e-05, + "loss": 0.55, + "step": 5924 + }, + { + "epoch": 0.46, + "grad_norm": 1.2299317171945692, + "learning_rate": 1.1783202560293863e-05, + "loss": 0.5908, + "step": 5925 + }, + { + "epoch": 0.46, + "grad_norm": 1.234587301032679, + "learning_rate": 1.1780730104775374e-05, + "loss": 0.5998, + "step": 5926 + }, + { + "epoch": 0.46, + "grad_norm": 1.0639864117141906, + "learning_rate": 1.1778257536830211e-05, + "loss": 0.5353, + "step": 5927 + }, + { + "epoch": 0.46, + "grad_norm": 1.1595802289999118, + "learning_rate": 1.1775784856614473e-05, + "loss": 0.5223, + "step": 5928 + }, + { + "epoch": 0.46, + "grad_norm": 1.125575077574416, + "learning_rate": 1.1773312064284275e-05, + "loss": 0.5552, + "step": 5929 + }, + { + "epoch": 0.46, + "grad_norm": 1.1473486273162008, + "learning_rate": 1.1770839159995738e-05, + "loss": 0.5426, + "step": 5930 + }, + { + "epoch": 0.46, + "grad_norm": 1.132527966775059, + "learning_rate": 1.1768366143904986e-05, + "loss": 0.5297, + "step": 5931 + }, + { + "epoch": 0.46, + "grad_norm": 1.0903657599637857, + "learning_rate": 1.1765893016168158e-05, + "loss": 0.5232, + "step": 5932 + }, + { + "epoch": 0.46, + "grad_norm": 1.1708886636893996, + "learning_rate": 1.1763419776941395e-05, + "loss": 0.616, + "step": 5933 + }, + { + "epoch": 0.46, + "grad_norm": 1.2104614798850823, + "learning_rate": 1.1760946426380838e-05, + "loss": 0.5964, + "step": 5934 + }, + { + "epoch": 0.46, + "grad_norm": 1.1134263328915142, + "learning_rate": 1.1758472964642651e-05, + "loss": 0.5122, + "step": 5935 + }, + { + "epoch": 0.46, + "grad_norm": 1.1541368919048283, + "learning_rate": 1.1755999391882993e-05, + "loss": 0.5348, + "step": 5936 + }, + { + "epoch": 0.46, + "grad_norm": 1.1571005708378703, + "learning_rate": 1.1753525708258034e-05, + "loss": 0.5326, + "step": 5937 + }, + { + "epoch": 0.46, + "grad_norm": 1.1534524013452785, + "learning_rate": 1.175105191392395e-05, + "loss": 0.6112, + "step": 5938 + }, + { + "epoch": 0.46, + "grad_norm": 1.207627183285773, + "learning_rate": 1.1748578009036925e-05, + "loss": 0.5764, + "step": 5939 + }, + { + "epoch": 0.46, + "grad_norm": 1.0799936327040143, + "learning_rate": 1.1746103993753146e-05, + "loss": 0.4969, + "step": 5940 + }, + { + "epoch": 0.46, + "grad_norm": 1.0990050910054272, + "learning_rate": 1.1743629868228815e-05, + "loss": 0.5508, + "step": 5941 + }, + { + "epoch": 0.46, + "grad_norm": 1.0899727584339496, + "learning_rate": 1.1741155632620135e-05, + "loss": 0.5599, + "step": 5942 + }, + { + "epoch": 0.46, + "grad_norm": 1.192224390501563, + "learning_rate": 1.1738681287083318e-05, + "loss": 0.5345, + "step": 5943 + }, + { + "epoch": 0.46, + "grad_norm": 2.0077912444536987, + "learning_rate": 1.1736206831774576e-05, + "loss": 0.5608, + "step": 5944 + }, + { + "epoch": 0.46, + "grad_norm": 0.9353286074428118, + "learning_rate": 1.1733732266850144e-05, + "loss": 0.4908, + "step": 5945 + }, + { + "epoch": 0.46, + "grad_norm": 1.0984879461780905, + "learning_rate": 1.1731257592466248e-05, + "loss": 0.555, + "step": 5946 + }, + { + "epoch": 0.46, + "grad_norm": 1.0896925735902507, + "learning_rate": 1.1728782808779126e-05, + "loss": 0.5817, + "step": 5947 + }, + { + "epoch": 0.46, + "grad_norm": 1.1967637471018489, + "learning_rate": 1.172630791594503e-05, + "loss": 0.5933, + "step": 5948 + }, + { + "epoch": 0.46, + "grad_norm": 1.0948715454249143, + "learning_rate": 1.1723832914120203e-05, + "loss": 0.5489, + "step": 5949 + }, + { + "epoch": 0.46, + "grad_norm": 1.12736728403082, + "learning_rate": 1.1721357803460915e-05, + "loss": 0.5393, + "step": 5950 + }, + { + "epoch": 0.46, + "grad_norm": 1.1518496213653773, + "learning_rate": 1.1718882584123425e-05, + "loss": 0.5458, + "step": 5951 + }, + { + "epoch": 0.46, + "grad_norm": 1.1689298114051585, + "learning_rate": 1.1716407256264014e-05, + "loss": 0.5602, + "step": 5952 + }, + { + "epoch": 0.46, + "grad_norm": 1.1658066349821727, + "learning_rate": 1.1713931820038952e-05, + "loss": 0.5317, + "step": 5953 + }, + { + "epoch": 0.46, + "grad_norm": 1.1068523900519198, + "learning_rate": 1.1711456275604534e-05, + "loss": 0.595, + "step": 5954 + }, + { + "epoch": 0.46, + "grad_norm": 1.0445035578410105, + "learning_rate": 1.170898062311705e-05, + "loss": 0.5791, + "step": 5955 + }, + { + "epoch": 0.46, + "grad_norm": 1.0967450143680333, + "learning_rate": 1.1706504862732801e-05, + "loss": 0.527, + "step": 5956 + }, + { + "epoch": 0.46, + "grad_norm": 1.2187901514726485, + "learning_rate": 1.17040289946081e-05, + "loss": 0.6348, + "step": 5957 + }, + { + "epoch": 0.46, + "grad_norm": 1.1074112718400848, + "learning_rate": 1.1701553018899255e-05, + "loss": 0.4888, + "step": 5958 + }, + { + "epoch": 0.46, + "grad_norm": 1.1410849179622504, + "learning_rate": 1.1699076935762585e-05, + "loss": 0.5592, + "step": 5959 + }, + { + "epoch": 0.46, + "grad_norm": 1.206214653248334, + "learning_rate": 1.1696600745354427e-05, + "loss": 0.5846, + "step": 5960 + }, + { + "epoch": 0.46, + "grad_norm": 1.1250317357143491, + "learning_rate": 1.1694124447831108e-05, + "loss": 0.5303, + "step": 5961 + }, + { + "epoch": 0.46, + "grad_norm": 1.008943676986175, + "learning_rate": 1.1691648043348972e-05, + "loss": 0.4977, + "step": 5962 + }, + { + "epoch": 0.46, + "grad_norm": 1.1748748022704938, + "learning_rate": 1.1689171532064371e-05, + "loss": 0.5849, + "step": 5963 + }, + { + "epoch": 0.46, + "grad_norm": 1.223466988277587, + "learning_rate": 1.1686694914133652e-05, + "loss": 0.5684, + "step": 5964 + }, + { + "epoch": 0.46, + "grad_norm": 1.276509750565055, + "learning_rate": 1.1684218189713183e-05, + "loss": 0.5751, + "step": 5965 + }, + { + "epoch": 0.46, + "grad_norm": 1.2072021955898635, + "learning_rate": 1.1681741358959328e-05, + "loss": 0.5492, + "step": 5966 + }, + { + "epoch": 0.46, + "grad_norm": 1.2206299294461564, + "learning_rate": 1.1679264422028469e-05, + "loss": 0.6172, + "step": 5967 + }, + { + "epoch": 0.46, + "grad_norm": 1.2630238588696714, + "learning_rate": 1.167678737907698e-05, + "loss": 0.5606, + "step": 5968 + }, + { + "epoch": 0.46, + "grad_norm": 1.0384634485254356, + "learning_rate": 1.1674310230261251e-05, + "loss": 0.5534, + "step": 5969 + }, + { + "epoch": 0.46, + "grad_norm": 1.0826442373090766, + "learning_rate": 1.167183297573768e-05, + "loss": 0.5247, + "step": 5970 + }, + { + "epoch": 0.46, + "grad_norm": 1.135455501253906, + "learning_rate": 1.166935561566267e-05, + "loss": 0.5342, + "step": 5971 + }, + { + "epoch": 0.46, + "grad_norm": 1.1690433113748349, + "learning_rate": 1.1666878150192626e-05, + "loss": 0.5679, + "step": 5972 + }, + { + "epoch": 0.46, + "grad_norm": 1.1313390686209732, + "learning_rate": 1.1664400579483965e-05, + "loss": 0.5665, + "step": 5973 + }, + { + "epoch": 0.46, + "grad_norm": 1.1735206747228935, + "learning_rate": 1.1661922903693107e-05, + "loss": 0.5417, + "step": 5974 + }, + { + "epoch": 0.46, + "grad_norm": 1.1505834032586726, + "learning_rate": 1.165944512297648e-05, + "loss": 0.5898, + "step": 5975 + }, + { + "epoch": 0.46, + "grad_norm": 1.0794256905921384, + "learning_rate": 1.1656967237490524e-05, + "loss": 0.5122, + "step": 5976 + }, + { + "epoch": 0.46, + "grad_norm": 1.1225912480128515, + "learning_rate": 1.1654489247391678e-05, + "loss": 0.5448, + "step": 5977 + }, + { + "epoch": 0.46, + "grad_norm": 1.2004672332956667, + "learning_rate": 1.1652011152836388e-05, + "loss": 0.5809, + "step": 5978 + }, + { + "epoch": 0.46, + "grad_norm": 1.1600084306969605, + "learning_rate": 1.1649532953981111e-05, + "loss": 0.6212, + "step": 5979 + }, + { + "epoch": 0.46, + "grad_norm": 1.1847955123296074, + "learning_rate": 1.1647054650982306e-05, + "loss": 0.575, + "step": 5980 + }, + { + "epoch": 0.46, + "grad_norm": 1.089012512636066, + "learning_rate": 1.1644576243996446e-05, + "loss": 0.5012, + "step": 5981 + }, + { + "epoch": 0.46, + "grad_norm": 1.2342043891531973, + "learning_rate": 1.1642097733180003e-05, + "loss": 0.5971, + "step": 5982 + }, + { + "epoch": 0.46, + "grad_norm": 1.2085579411257474, + "learning_rate": 1.1639619118689456e-05, + "loss": 0.5863, + "step": 5983 + }, + { + "epoch": 0.46, + "grad_norm": 1.1374002622537998, + "learning_rate": 1.1637140400681296e-05, + "loss": 0.5664, + "step": 5984 + }, + { + "epoch": 0.46, + "grad_norm": 1.1734640410529102, + "learning_rate": 1.1634661579312012e-05, + "loss": 0.5435, + "step": 5985 + }, + { + "epoch": 0.46, + "grad_norm": 1.1797157435794052, + "learning_rate": 1.1632182654738116e-05, + "loss": 0.5925, + "step": 5986 + }, + { + "epoch": 0.46, + "grad_norm": 1.2278237145567825, + "learning_rate": 1.1629703627116104e-05, + "loss": 0.6195, + "step": 5987 + }, + { + "epoch": 0.46, + "grad_norm": 1.0704874048930837, + "learning_rate": 1.1627224496602496e-05, + "loss": 0.5312, + "step": 5988 + }, + { + "epoch": 0.46, + "grad_norm": 1.2364541421940813, + "learning_rate": 1.1624745263353808e-05, + "loss": 0.6087, + "step": 5989 + }, + { + "epoch": 0.46, + "grad_norm": 1.1730046231958415, + "learning_rate": 1.1622265927526566e-05, + "loss": 0.5496, + "step": 5990 + }, + { + "epoch": 0.46, + "grad_norm": 1.035689594416083, + "learning_rate": 1.1619786489277312e-05, + "loss": 0.5185, + "step": 5991 + }, + { + "epoch": 0.46, + "grad_norm": 1.1143388509744039, + "learning_rate": 1.1617306948762576e-05, + "loss": 0.5245, + "step": 5992 + }, + { + "epoch": 0.46, + "grad_norm": 1.2708955916283018, + "learning_rate": 1.1614827306138912e-05, + "loss": 0.5783, + "step": 5993 + }, + { + "epoch": 0.47, + "grad_norm": 1.113705631147262, + "learning_rate": 1.1612347561562865e-05, + "loss": 0.5612, + "step": 5994 + }, + { + "epoch": 0.47, + "grad_norm": 1.1133749403764557, + "learning_rate": 1.1609867715190997e-05, + "loss": 0.5667, + "step": 5995 + }, + { + "epoch": 0.47, + "grad_norm": 1.1728270160479073, + "learning_rate": 1.1607387767179881e-05, + "loss": 0.5273, + "step": 5996 + }, + { + "epoch": 0.47, + "grad_norm": 1.132064361825146, + "learning_rate": 1.1604907717686075e-05, + "loss": 0.5189, + "step": 5997 + }, + { + "epoch": 0.47, + "grad_norm": 1.1384956655373737, + "learning_rate": 1.1602427566866166e-05, + "loss": 0.5398, + "step": 5998 + }, + { + "epoch": 0.47, + "grad_norm": 1.2144947102550459, + "learning_rate": 1.1599947314876738e-05, + "loss": 0.5539, + "step": 5999 + }, + { + "epoch": 0.47, + "grad_norm": 1.119428294759224, + "learning_rate": 1.1597466961874381e-05, + "loss": 0.5131, + "step": 6000 + }, + { + "epoch": 0.47, + "grad_norm": 1.2082339717307802, + "learning_rate": 1.1594986508015698e-05, + "loss": 0.555, + "step": 6001 + }, + { + "epoch": 0.47, + "grad_norm": 1.1382908911679626, + "learning_rate": 1.1592505953457282e-05, + "loss": 0.6189, + "step": 6002 + }, + { + "epoch": 0.47, + "grad_norm": 1.3104182491484984, + "learning_rate": 1.1590025298355749e-05, + "loss": 0.5667, + "step": 6003 + }, + { + "epoch": 0.47, + "grad_norm": 1.3662180924245277, + "learning_rate": 1.1587544542867716e-05, + "loss": 0.6686, + "step": 6004 + }, + { + "epoch": 0.47, + "grad_norm": 1.151598103980875, + "learning_rate": 1.1585063687149807e-05, + "loss": 0.5373, + "step": 6005 + }, + { + "epoch": 0.47, + "grad_norm": 1.1814977133846232, + "learning_rate": 1.158258273135865e-05, + "loss": 0.5538, + "step": 6006 + }, + { + "epoch": 0.47, + "grad_norm": 1.224022391516502, + "learning_rate": 1.158010167565088e-05, + "loss": 0.6384, + "step": 6007 + }, + { + "epoch": 0.47, + "grad_norm": 1.1989991424977446, + "learning_rate": 1.1577620520183135e-05, + "loss": 0.5239, + "step": 6008 + }, + { + "epoch": 0.47, + "grad_norm": 1.1996822393430697, + "learning_rate": 1.1575139265112072e-05, + "loss": 0.5065, + "step": 6009 + }, + { + "epoch": 0.47, + "grad_norm": 1.1306896444705934, + "learning_rate": 1.1572657910594336e-05, + "loss": 0.5249, + "step": 6010 + }, + { + "epoch": 0.47, + "grad_norm": 1.1442817702513723, + "learning_rate": 1.1570176456786597e-05, + "loss": 0.5541, + "step": 6011 + }, + { + "epoch": 0.47, + "grad_norm": 1.2164129199443567, + "learning_rate": 1.1567694903845515e-05, + "loss": 0.5739, + "step": 6012 + }, + { + "epoch": 0.47, + "grad_norm": 1.2389645781914767, + "learning_rate": 1.1565213251927764e-05, + "loss": 0.5953, + "step": 6013 + }, + { + "epoch": 0.47, + "grad_norm": 1.3176629381565335, + "learning_rate": 1.1562731501190027e-05, + "loss": 0.6181, + "step": 6014 + }, + { + "epoch": 0.47, + "grad_norm": 1.080050587087489, + "learning_rate": 1.1560249651788985e-05, + "loss": 0.4979, + "step": 6015 + }, + { + "epoch": 0.47, + "grad_norm": 1.213336610728508, + "learning_rate": 1.155776770388134e-05, + "loss": 0.5931, + "step": 6016 + }, + { + "epoch": 0.47, + "grad_norm": 1.2257029929626926, + "learning_rate": 1.1555285657623776e-05, + "loss": 0.5298, + "step": 6017 + }, + { + "epoch": 0.47, + "grad_norm": 1.199148766280429, + "learning_rate": 1.1552803513173004e-05, + "loss": 0.5782, + "step": 6018 + }, + { + "epoch": 0.47, + "grad_norm": 1.1464487099502134, + "learning_rate": 1.1550321270685739e-05, + "loss": 0.5308, + "step": 6019 + }, + { + "epoch": 0.47, + "grad_norm": 1.2900742230760318, + "learning_rate": 1.1547838930318689e-05, + "loss": 0.5616, + "step": 6020 + }, + { + "epoch": 0.47, + "grad_norm": 1.181885143863875, + "learning_rate": 1.1545356492228585e-05, + "loss": 0.5739, + "step": 6021 + }, + { + "epoch": 0.47, + "grad_norm": 1.1470877052572075, + "learning_rate": 1.1542873956572151e-05, + "loss": 0.5653, + "step": 6022 + }, + { + "epoch": 0.47, + "grad_norm": 1.0975241802219553, + "learning_rate": 1.1540391323506124e-05, + "loss": 0.5422, + "step": 6023 + }, + { + "epoch": 0.47, + "grad_norm": 1.2671880606804111, + "learning_rate": 1.1537908593187246e-05, + "loss": 0.5618, + "step": 6024 + }, + { + "epoch": 0.47, + "grad_norm": 1.2110152188863899, + "learning_rate": 1.1535425765772262e-05, + "loss": 0.5546, + "step": 6025 + }, + { + "epoch": 0.47, + "grad_norm": 1.1214674482991491, + "learning_rate": 1.1532942841417931e-05, + "loss": 0.5415, + "step": 6026 + }, + { + "epoch": 0.47, + "grad_norm": 1.3119752379819598, + "learning_rate": 1.1530459820281008e-05, + "loss": 0.6052, + "step": 6027 + }, + { + "epoch": 0.47, + "grad_norm": 1.1237423542853127, + "learning_rate": 1.1527976702518257e-05, + "loss": 0.5355, + "step": 6028 + }, + { + "epoch": 0.47, + "grad_norm": 1.1256897189583506, + "learning_rate": 1.1525493488286458e-05, + "loss": 0.5348, + "step": 6029 + }, + { + "epoch": 0.47, + "grad_norm": 1.2458725018335104, + "learning_rate": 1.1523010177742382e-05, + "loss": 0.565, + "step": 6030 + }, + { + "epoch": 0.47, + "grad_norm": 1.1306324996404287, + "learning_rate": 1.152052677104282e-05, + "loss": 0.5562, + "step": 6031 + }, + { + "epoch": 0.47, + "grad_norm": 1.1892979212134052, + "learning_rate": 1.1518043268344554e-05, + "loss": 0.6081, + "step": 6032 + }, + { + "epoch": 0.47, + "grad_norm": 1.2624901459564288, + "learning_rate": 1.1515559669804386e-05, + "loss": 0.6001, + "step": 6033 + }, + { + "epoch": 0.47, + "grad_norm": 1.1884519876965736, + "learning_rate": 1.1513075975579116e-05, + "loss": 0.5533, + "step": 6034 + }, + { + "epoch": 0.47, + "grad_norm": 1.0712219606733848, + "learning_rate": 1.1510592185825553e-05, + "loss": 0.4657, + "step": 6035 + }, + { + "epoch": 0.47, + "grad_norm": 1.229893674131583, + "learning_rate": 1.150810830070051e-05, + "loss": 0.5969, + "step": 6036 + }, + { + "epoch": 0.47, + "grad_norm": 1.1520841144377467, + "learning_rate": 1.1505624320360813e-05, + "loss": 0.5821, + "step": 6037 + }, + { + "epoch": 0.47, + "grad_norm": 1.1657764694346948, + "learning_rate": 1.1503140244963283e-05, + "loss": 0.5733, + "step": 6038 + }, + { + "epoch": 0.47, + "grad_norm": 1.1349233494726247, + "learning_rate": 1.1500656074664756e-05, + "loss": 0.5414, + "step": 6039 + }, + { + "epoch": 0.47, + "grad_norm": 1.1861377732920508, + "learning_rate": 1.1498171809622067e-05, + "loss": 0.5422, + "step": 6040 + }, + { + "epoch": 0.47, + "grad_norm": 1.150809763796626, + "learning_rate": 1.1495687449992059e-05, + "loss": 0.4986, + "step": 6041 + }, + { + "epoch": 0.47, + "grad_norm": 1.1371039298375025, + "learning_rate": 1.149320299593159e-05, + "loss": 0.5916, + "step": 6042 + }, + { + "epoch": 0.47, + "grad_norm": 1.1367008692852423, + "learning_rate": 1.149071844759751e-05, + "loss": 0.5277, + "step": 6043 + }, + { + "epoch": 0.47, + "grad_norm": 1.1330809702580076, + "learning_rate": 1.1488233805146685e-05, + "loss": 0.5555, + "step": 6044 + }, + { + "epoch": 0.47, + "grad_norm": 1.186683273538549, + "learning_rate": 1.1485749068735982e-05, + "loss": 0.5772, + "step": 6045 + }, + { + "epoch": 0.47, + "grad_norm": 1.1466717802440118, + "learning_rate": 1.148326423852227e-05, + "loss": 0.5352, + "step": 6046 + }, + { + "epoch": 0.47, + "grad_norm": 1.167547807143518, + "learning_rate": 1.1480779314662438e-05, + "loss": 0.5804, + "step": 6047 + }, + { + "epoch": 0.47, + "grad_norm": 1.290652733382439, + "learning_rate": 1.1478294297313366e-05, + "loss": 0.6415, + "step": 6048 + }, + { + "epoch": 0.47, + "grad_norm": 1.1767032088278035, + "learning_rate": 1.1475809186631947e-05, + "loss": 0.6049, + "step": 6049 + }, + { + "epoch": 0.47, + "grad_norm": 1.129735094600574, + "learning_rate": 1.1473323982775085e-05, + "loss": 0.5539, + "step": 6050 + }, + { + "epoch": 0.47, + "grad_norm": 1.177364715207251, + "learning_rate": 1.1470838685899675e-05, + "loss": 0.532, + "step": 6051 + }, + { + "epoch": 0.47, + "grad_norm": 1.1061015121030253, + "learning_rate": 1.146835329616263e-05, + "loss": 0.5021, + "step": 6052 + }, + { + "epoch": 0.47, + "grad_norm": 1.1822799592411755, + "learning_rate": 1.1465867813720865e-05, + "loss": 0.5967, + "step": 6053 + }, + { + "epoch": 0.47, + "grad_norm": 1.1386216743094857, + "learning_rate": 1.1463382238731305e-05, + "loss": 0.5561, + "step": 6054 + }, + { + "epoch": 0.47, + "grad_norm": 1.1723453849243168, + "learning_rate": 1.1460896571350875e-05, + "loss": 0.5998, + "step": 6055 + }, + { + "epoch": 0.47, + "grad_norm": 1.210706331891644, + "learning_rate": 1.1458410811736503e-05, + "loss": 0.5615, + "step": 6056 + }, + { + "epoch": 0.47, + "grad_norm": 1.1605290286021857, + "learning_rate": 1.1455924960045136e-05, + "loss": 0.5542, + "step": 6057 + }, + { + "epoch": 0.47, + "grad_norm": 1.3127881142599516, + "learning_rate": 1.145343901643371e-05, + "loss": 0.5599, + "step": 6058 + }, + { + "epoch": 0.47, + "grad_norm": 1.1901311836542792, + "learning_rate": 1.1450952981059182e-05, + "loss": 0.5495, + "step": 6059 + }, + { + "epoch": 0.47, + "grad_norm": 1.2635045124995357, + "learning_rate": 1.144846685407851e-05, + "loss": 0.623, + "step": 6060 + }, + { + "epoch": 0.47, + "grad_norm": 1.236935868425668, + "learning_rate": 1.1445980635648649e-05, + "loss": 0.5437, + "step": 6061 + }, + { + "epoch": 0.47, + "grad_norm": 1.1302637827876, + "learning_rate": 1.1443494325926572e-05, + "loss": 0.5638, + "step": 6062 + }, + { + "epoch": 0.47, + "grad_norm": 1.2087317774443476, + "learning_rate": 1.1441007925069248e-05, + "loss": 0.5703, + "step": 6063 + }, + { + "epoch": 0.47, + "grad_norm": 1.2696397470915577, + "learning_rate": 1.1438521433233657e-05, + "loss": 0.5883, + "step": 6064 + }, + { + "epoch": 0.47, + "grad_norm": 1.0951124697019443, + "learning_rate": 1.1436034850576794e-05, + "loss": 0.5382, + "step": 6065 + }, + { + "epoch": 0.47, + "grad_norm": 1.0863121539069758, + "learning_rate": 1.1433548177255638e-05, + "loss": 0.5102, + "step": 6066 + }, + { + "epoch": 0.47, + "grad_norm": 1.2123883815396206, + "learning_rate": 1.143106141342719e-05, + "loss": 0.5828, + "step": 6067 + }, + { + "epoch": 0.47, + "grad_norm": 1.1480736707615016, + "learning_rate": 1.1428574559248448e-05, + "loss": 0.5817, + "step": 6068 + }, + { + "epoch": 0.47, + "grad_norm": 1.1037466701919598, + "learning_rate": 1.1426087614876424e-05, + "loss": 0.5306, + "step": 6069 + }, + { + "epoch": 0.47, + "grad_norm": 1.171892750923502, + "learning_rate": 1.1423600580468137e-05, + "loss": 0.5238, + "step": 6070 + }, + { + "epoch": 0.47, + "grad_norm": 1.21161450565429, + "learning_rate": 1.1421113456180597e-05, + "loss": 0.5793, + "step": 6071 + }, + { + "epoch": 0.47, + "grad_norm": 1.145258909795778, + "learning_rate": 1.1418626242170833e-05, + "loss": 0.5717, + "step": 6072 + }, + { + "epoch": 0.47, + "grad_norm": 1.1455312793145906, + "learning_rate": 1.1416138938595874e-05, + "loss": 0.5468, + "step": 6073 + }, + { + "epoch": 0.47, + "grad_norm": 1.1801996193160358, + "learning_rate": 1.1413651545612758e-05, + "loss": 0.566, + "step": 6074 + }, + { + "epoch": 0.47, + "grad_norm": 1.1543277536254237, + "learning_rate": 1.1411164063378529e-05, + "loss": 0.5614, + "step": 6075 + }, + { + "epoch": 0.47, + "grad_norm": 1.2549773303108371, + "learning_rate": 1.1408676492050229e-05, + "loss": 0.5937, + "step": 6076 + }, + { + "epoch": 0.47, + "grad_norm": 1.0943050610967415, + "learning_rate": 1.1406188831784912e-05, + "loss": 0.5143, + "step": 6077 + }, + { + "epoch": 0.47, + "grad_norm": 1.2521838185404046, + "learning_rate": 1.1403701082739644e-05, + "loss": 0.6241, + "step": 6078 + }, + { + "epoch": 0.47, + "grad_norm": 1.2241330231799306, + "learning_rate": 1.1401213245071481e-05, + "loss": 0.591, + "step": 6079 + }, + { + "epoch": 0.47, + "grad_norm": 1.1258022838483297, + "learning_rate": 1.1398725318937503e-05, + "loss": 0.5129, + "step": 6080 + }, + { + "epoch": 0.47, + "grad_norm": 1.251965408145916, + "learning_rate": 1.1396237304494772e-05, + "loss": 0.5122, + "step": 6081 + }, + { + "epoch": 0.47, + "grad_norm": 1.1887100980782717, + "learning_rate": 1.1393749201900377e-05, + "loss": 0.5747, + "step": 6082 + }, + { + "epoch": 0.47, + "grad_norm": 1.1874656672282982, + "learning_rate": 1.1391261011311407e-05, + "loss": 0.513, + "step": 6083 + }, + { + "epoch": 0.47, + "grad_norm": 1.0929043361614448, + "learning_rate": 1.138877273288495e-05, + "loss": 0.5162, + "step": 6084 + }, + { + "epoch": 0.47, + "grad_norm": 1.0594942549027224, + "learning_rate": 1.1386284366778106e-05, + "loss": 0.506, + "step": 6085 + }, + { + "epoch": 0.47, + "grad_norm": 1.237677345669188, + "learning_rate": 1.1383795913147978e-05, + "loss": 0.5822, + "step": 6086 + }, + { + "epoch": 0.47, + "grad_norm": 1.0246579413587495, + "learning_rate": 1.138130737215167e-05, + "loss": 0.4975, + "step": 6087 + }, + { + "epoch": 0.47, + "grad_norm": 1.1503271010885996, + "learning_rate": 1.1378818743946308e-05, + "loss": 0.5731, + "step": 6088 + }, + { + "epoch": 0.47, + "grad_norm": 1.3168991931780432, + "learning_rate": 1.1376330028689e-05, + "loss": 0.54, + "step": 6089 + }, + { + "epoch": 0.47, + "grad_norm": 1.1430622346810895, + "learning_rate": 1.137384122653688e-05, + "loss": 0.516, + "step": 6090 + }, + { + "epoch": 0.47, + "grad_norm": 1.0391458463584249, + "learning_rate": 1.137135233764707e-05, + "loss": 0.5263, + "step": 6091 + }, + { + "epoch": 0.47, + "grad_norm": 1.2606152412214606, + "learning_rate": 1.1368863362176713e-05, + "loss": 0.5872, + "step": 6092 + }, + { + "epoch": 0.47, + "grad_norm": 1.2150401848948964, + "learning_rate": 1.1366374300282954e-05, + "loss": 0.5402, + "step": 6093 + }, + { + "epoch": 0.47, + "grad_norm": 1.2435444551780566, + "learning_rate": 1.1363885152122933e-05, + "loss": 0.5731, + "step": 6094 + }, + { + "epoch": 0.47, + "grad_norm": 1.1744543553654667, + "learning_rate": 1.1361395917853808e-05, + "loss": 0.5379, + "step": 6095 + }, + { + "epoch": 0.47, + "grad_norm": 1.2422188329043826, + "learning_rate": 1.1358906597632731e-05, + "loss": 0.5761, + "step": 6096 + }, + { + "epoch": 0.47, + "grad_norm": 1.169244535132021, + "learning_rate": 1.135641719161687e-05, + "loss": 0.5983, + "step": 6097 + }, + { + "epoch": 0.47, + "grad_norm": 1.1300913256478453, + "learning_rate": 1.1353927699963396e-05, + "loss": 0.5251, + "step": 6098 + }, + { + "epoch": 0.47, + "grad_norm": 1.2876857456920567, + "learning_rate": 1.135143812282948e-05, + "loss": 0.5808, + "step": 6099 + }, + { + "epoch": 0.47, + "grad_norm": 1.1864677761507028, + "learning_rate": 1.1348948460372302e-05, + "loss": 0.5515, + "step": 6100 + }, + { + "epoch": 0.47, + "grad_norm": 1.0960707430284047, + "learning_rate": 1.1346458712749049e-05, + "loss": 0.5575, + "step": 6101 + }, + { + "epoch": 0.47, + "grad_norm": 1.1661438735643315, + "learning_rate": 1.1343968880116907e-05, + "loss": 0.5612, + "step": 6102 + }, + { + "epoch": 0.47, + "grad_norm": 1.1487139414543932, + "learning_rate": 1.1341478962633081e-05, + "loss": 0.5233, + "step": 6103 + }, + { + "epoch": 0.47, + "grad_norm": 1.0363604756585434, + "learning_rate": 1.1338988960454763e-05, + "loss": 0.4851, + "step": 6104 + }, + { + "epoch": 0.47, + "grad_norm": 1.1794222603017963, + "learning_rate": 1.1336498873739166e-05, + "loss": 0.5996, + "step": 6105 + }, + { + "epoch": 0.47, + "grad_norm": 1.2842968789355962, + "learning_rate": 1.13340087026435e-05, + "loss": 0.6117, + "step": 6106 + }, + { + "epoch": 0.47, + "grad_norm": 1.1565401383397298, + "learning_rate": 1.1331518447324978e-05, + "loss": 0.5492, + "step": 6107 + }, + { + "epoch": 0.47, + "grad_norm": 1.2901046239579763, + "learning_rate": 1.1329028107940832e-05, + "loss": 0.6147, + "step": 6108 + }, + { + "epoch": 0.47, + "grad_norm": 1.0247321057884973, + "learning_rate": 1.1326537684648282e-05, + "loss": 0.5042, + "step": 6109 + }, + { + "epoch": 0.47, + "grad_norm": 1.1642414441915516, + "learning_rate": 1.1324047177604565e-05, + "loss": 0.5156, + "step": 6110 + }, + { + "epoch": 0.47, + "grad_norm": 1.2482839726279775, + "learning_rate": 1.1321556586966917e-05, + "loss": 0.5936, + "step": 6111 + }, + { + "epoch": 0.47, + "grad_norm": 1.1575613060817325, + "learning_rate": 1.1319065912892584e-05, + "loss": 0.5384, + "step": 6112 + }, + { + "epoch": 0.47, + "grad_norm": 1.2922929455541055, + "learning_rate": 1.1316575155538816e-05, + "loss": 0.6094, + "step": 6113 + }, + { + "epoch": 0.47, + "grad_norm": 1.1511815821792062, + "learning_rate": 1.1314084315062863e-05, + "loss": 0.5347, + "step": 6114 + }, + { + "epoch": 0.47, + "grad_norm": 1.1563639713512337, + "learning_rate": 1.131159339162199e-05, + "loss": 0.5689, + "step": 6115 + }, + { + "epoch": 0.47, + "grad_norm": 1.1636079758498936, + "learning_rate": 1.1309102385373459e-05, + "loss": 0.5629, + "step": 6116 + }, + { + "epoch": 0.47, + "grad_norm": 1.2872494518599655, + "learning_rate": 1.1306611296474536e-05, + "loss": 0.5658, + "step": 6117 + }, + { + "epoch": 0.47, + "grad_norm": 1.1639668982246267, + "learning_rate": 1.1304120125082504e-05, + "loss": 0.5931, + "step": 6118 + }, + { + "epoch": 0.47, + "grad_norm": 1.1237137116714113, + "learning_rate": 1.1301628871354641e-05, + "loss": 0.5275, + "step": 6119 + }, + { + "epoch": 0.47, + "grad_norm": 1.1171285973873313, + "learning_rate": 1.129913753544823e-05, + "loss": 0.5402, + "step": 6120 + }, + { + "epoch": 0.47, + "grad_norm": 1.2248233687287509, + "learning_rate": 1.1296646117520567e-05, + "loss": 0.5971, + "step": 6121 + }, + { + "epoch": 0.47, + "grad_norm": 1.144457557571205, + "learning_rate": 1.1294154617728942e-05, + "loss": 0.5616, + "step": 6122 + }, + { + "epoch": 0.48, + "grad_norm": 1.128843206737472, + "learning_rate": 1.1291663036230658e-05, + "loss": 0.6098, + "step": 6123 + }, + { + "epoch": 0.48, + "grad_norm": 1.1859439392317106, + "learning_rate": 1.1289171373183026e-05, + "loss": 0.5655, + "step": 6124 + }, + { + "epoch": 0.48, + "grad_norm": 1.2119237021801355, + "learning_rate": 1.1286679628743349e-05, + "loss": 0.5533, + "step": 6125 + }, + { + "epoch": 0.48, + "grad_norm": 1.2164796565746279, + "learning_rate": 1.1284187803068953e-05, + "loss": 0.6083, + "step": 6126 + }, + { + "epoch": 0.48, + "grad_norm": 1.1327797194374818, + "learning_rate": 1.1281695896317153e-05, + "loss": 0.5423, + "step": 6127 + }, + { + "epoch": 0.48, + "grad_norm": 1.2823281287869477, + "learning_rate": 1.127920390864528e-05, + "loss": 0.5764, + "step": 6128 + }, + { + "epoch": 0.48, + "grad_norm": 1.070036748183764, + "learning_rate": 1.1276711840210663e-05, + "loss": 0.5204, + "step": 6129 + }, + { + "epoch": 0.48, + "grad_norm": 1.2320117781992606, + "learning_rate": 1.127421969117064e-05, + "loss": 0.5419, + "step": 6130 + }, + { + "epoch": 0.48, + "grad_norm": 1.1750960270215633, + "learning_rate": 1.1271727461682558e-05, + "loss": 0.5746, + "step": 6131 + }, + { + "epoch": 0.48, + "grad_norm": 1.2556764935216203, + "learning_rate": 1.1269235151903754e-05, + "loss": 0.5761, + "step": 6132 + }, + { + "epoch": 0.48, + "grad_norm": 1.1858653814328417, + "learning_rate": 1.126674276199159e-05, + "loss": 0.5399, + "step": 6133 + }, + { + "epoch": 0.48, + "grad_norm": 1.120311610435654, + "learning_rate": 1.1264250292103423e-05, + "loss": 0.4987, + "step": 6134 + }, + { + "epoch": 0.48, + "grad_norm": 1.2430094272903576, + "learning_rate": 1.1261757742396606e-05, + "loss": 0.6163, + "step": 6135 + }, + { + "epoch": 0.48, + "grad_norm": 1.1713331877357887, + "learning_rate": 1.1259265113028517e-05, + "loss": 0.567, + "step": 6136 + }, + { + "epoch": 0.48, + "grad_norm": 1.2023741125247693, + "learning_rate": 1.1256772404156521e-05, + "loss": 0.5412, + "step": 6137 + }, + { + "epoch": 0.48, + "grad_norm": 1.1715327462452352, + "learning_rate": 1.1254279615938001e-05, + "loss": 0.603, + "step": 6138 + }, + { + "epoch": 0.48, + "grad_norm": 1.1307029813630516, + "learning_rate": 1.1251786748530342e-05, + "loss": 0.5531, + "step": 6139 + }, + { + "epoch": 0.48, + "grad_norm": 1.1911774962897999, + "learning_rate": 1.124929380209092e-05, + "loss": 0.5529, + "step": 6140 + }, + { + "epoch": 0.48, + "grad_norm": 1.1671098593916343, + "learning_rate": 1.124680077677714e-05, + "loss": 0.5478, + "step": 6141 + }, + { + "epoch": 0.48, + "grad_norm": 1.209040528011753, + "learning_rate": 1.124430767274639e-05, + "loss": 0.6081, + "step": 6142 + }, + { + "epoch": 0.48, + "grad_norm": 1.1465392221462014, + "learning_rate": 1.124181449015608e-05, + "loss": 0.58, + "step": 6143 + }, + { + "epoch": 0.48, + "grad_norm": 1.131175627819122, + "learning_rate": 1.1239321229163615e-05, + "loss": 0.5669, + "step": 6144 + }, + { + "epoch": 0.48, + "grad_norm": 1.2186527213374134, + "learning_rate": 1.1236827889926402e-05, + "loss": 0.5652, + "step": 6145 + }, + { + "epoch": 0.48, + "grad_norm": 1.2058958867967815, + "learning_rate": 1.1234334472601868e-05, + "loss": 0.607, + "step": 6146 + }, + { + "epoch": 0.48, + "grad_norm": 1.2508591560343088, + "learning_rate": 1.1231840977347427e-05, + "loss": 0.5752, + "step": 6147 + }, + { + "epoch": 0.48, + "grad_norm": 1.3113149561034705, + "learning_rate": 1.1229347404320515e-05, + "loss": 0.6231, + "step": 6148 + }, + { + "epoch": 0.48, + "grad_norm": 1.1797706119994518, + "learning_rate": 1.1226853753678555e-05, + "loss": 0.5521, + "step": 6149 + }, + { + "epoch": 0.48, + "grad_norm": 1.1573637675782535, + "learning_rate": 1.1224360025578987e-05, + "loss": 0.5661, + "step": 6150 + }, + { + "epoch": 0.48, + "grad_norm": 1.0177478612110338, + "learning_rate": 1.1221866220179254e-05, + "loss": 0.5118, + "step": 6151 + }, + { + "epoch": 0.48, + "grad_norm": 1.2031172714976002, + "learning_rate": 1.1219372337636802e-05, + "loss": 0.5805, + "step": 6152 + }, + { + "epoch": 0.48, + "grad_norm": 1.1654660768551133, + "learning_rate": 1.1216878378109085e-05, + "loss": 0.591, + "step": 6153 + }, + { + "epoch": 0.48, + "grad_norm": 1.278893000791008, + "learning_rate": 1.1214384341753557e-05, + "loss": 0.5893, + "step": 6154 + }, + { + "epoch": 0.48, + "grad_norm": 1.0944814688384625, + "learning_rate": 1.1211890228727679e-05, + "loss": 0.5593, + "step": 6155 + }, + { + "epoch": 0.48, + "grad_norm": 1.2107200673460006, + "learning_rate": 1.120939603918892e-05, + "loss": 0.5401, + "step": 6156 + }, + { + "epoch": 0.48, + "grad_norm": 1.1891111186009367, + "learning_rate": 1.1206901773294749e-05, + "loss": 0.5467, + "step": 6157 + }, + { + "epoch": 0.48, + "grad_norm": 1.2391264524948702, + "learning_rate": 1.1204407431202642e-05, + "loss": 0.5943, + "step": 6158 + }, + { + "epoch": 0.48, + "grad_norm": 1.1448040009037483, + "learning_rate": 1.120191301307008e-05, + "loss": 0.553, + "step": 6159 + }, + { + "epoch": 0.48, + "grad_norm": 1.0451598279734644, + "learning_rate": 1.1199418519054549e-05, + "loss": 0.4847, + "step": 6160 + }, + { + "epoch": 0.48, + "grad_norm": 1.1327962939977638, + "learning_rate": 1.1196923949313537e-05, + "loss": 0.5197, + "step": 6161 + }, + { + "epoch": 0.48, + "grad_norm": 1.1831119173498421, + "learning_rate": 1.1194429304004541e-05, + "loss": 0.5999, + "step": 6162 + }, + { + "epoch": 0.48, + "grad_norm": 1.211938997637693, + "learning_rate": 1.1191934583285063e-05, + "loss": 0.5342, + "step": 6163 + }, + { + "epoch": 0.48, + "grad_norm": 1.0937776834526711, + "learning_rate": 1.1189439787312603e-05, + "loss": 0.5601, + "step": 6164 + }, + { + "epoch": 0.48, + "grad_norm": 1.1182546787108651, + "learning_rate": 1.1186944916244673e-05, + "loss": 0.5559, + "step": 6165 + }, + { + "epoch": 0.48, + "grad_norm": 1.4341959922639842, + "learning_rate": 1.1184449970238787e-05, + "loss": 0.5554, + "step": 6166 + }, + { + "epoch": 0.48, + "grad_norm": 1.1721113856638508, + "learning_rate": 1.1181954949452463e-05, + "loss": 0.5801, + "step": 6167 + }, + { + "epoch": 0.48, + "grad_norm": 1.1589267265091858, + "learning_rate": 1.1179459854043227e-05, + "loss": 0.5391, + "step": 6168 + }, + { + "epoch": 0.48, + "grad_norm": 1.1216112597206436, + "learning_rate": 1.1176964684168603e-05, + "loss": 0.5426, + "step": 6169 + }, + { + "epoch": 0.48, + "grad_norm": 1.1056470962998983, + "learning_rate": 1.1174469439986126e-05, + "loss": 0.5618, + "step": 6170 + }, + { + "epoch": 0.48, + "grad_norm": 1.2247803004468978, + "learning_rate": 1.1171974121653333e-05, + "loss": 0.5663, + "step": 6171 + }, + { + "epoch": 0.48, + "grad_norm": 1.1264885485287592, + "learning_rate": 1.116947872932777e-05, + "loss": 0.5633, + "step": 6172 + }, + { + "epoch": 0.48, + "grad_norm": 1.1898257919829878, + "learning_rate": 1.1166983263166979e-05, + "loss": 0.5433, + "step": 6173 + }, + { + "epoch": 0.48, + "grad_norm": 1.1917158381225335, + "learning_rate": 1.1164487723328516e-05, + "loss": 0.5849, + "step": 6174 + }, + { + "epoch": 0.48, + "grad_norm": 1.0908833675891934, + "learning_rate": 1.1161992109969932e-05, + "loss": 0.5728, + "step": 6175 + }, + { + "epoch": 0.48, + "grad_norm": 1.2274191228012494, + "learning_rate": 1.115949642324879e-05, + "loss": 0.6096, + "step": 6176 + }, + { + "epoch": 0.48, + "grad_norm": 1.140941001282802, + "learning_rate": 1.1157000663322662e-05, + "loss": 0.5523, + "step": 6177 + }, + { + "epoch": 0.48, + "grad_norm": 1.2186166250168242, + "learning_rate": 1.115450483034911e-05, + "loss": 0.5749, + "step": 6178 + }, + { + "epoch": 0.48, + "grad_norm": 1.1872916289770912, + "learning_rate": 1.115200892448571e-05, + "loss": 0.5628, + "step": 6179 + }, + { + "epoch": 0.48, + "grad_norm": 1.1816432479955392, + "learning_rate": 1.1149512945890044e-05, + "loss": 0.5599, + "step": 6180 + }, + { + "epoch": 0.48, + "grad_norm": 1.1784970674959776, + "learning_rate": 1.1147016894719695e-05, + "loss": 0.5754, + "step": 6181 + }, + { + "epoch": 0.48, + "grad_norm": 1.1310563253615806, + "learning_rate": 1.1144520771132252e-05, + "loss": 0.534, + "step": 6182 + }, + { + "epoch": 0.48, + "grad_norm": 1.1835764767233026, + "learning_rate": 1.1142024575285308e-05, + "loss": 0.5458, + "step": 6183 + }, + { + "epoch": 0.48, + "grad_norm": 1.1392449168798588, + "learning_rate": 1.1139528307336463e-05, + "loss": 0.5747, + "step": 6184 + }, + { + "epoch": 0.48, + "grad_norm": 1.158187634523703, + "learning_rate": 1.1137031967443312e-05, + "loss": 0.5877, + "step": 6185 + }, + { + "epoch": 0.48, + "grad_norm": 1.2891116046222504, + "learning_rate": 1.1134535555763466e-05, + "loss": 0.5957, + "step": 6186 + }, + { + "epoch": 0.48, + "grad_norm": 1.1423768999322064, + "learning_rate": 1.113203907245454e-05, + "loss": 0.5516, + "step": 6187 + }, + { + "epoch": 0.48, + "grad_norm": 1.1826018152096238, + "learning_rate": 1.1129542517674147e-05, + "loss": 0.57, + "step": 6188 + }, + { + "epoch": 0.48, + "grad_norm": 1.1678530020117521, + "learning_rate": 1.1127045891579906e-05, + "loss": 0.4987, + "step": 6189 + }, + { + "epoch": 0.48, + "grad_norm": 1.1482155514031898, + "learning_rate": 1.1124549194329445e-05, + "loss": 0.5555, + "step": 6190 + }, + { + "epoch": 0.48, + "grad_norm": 1.1638159778875492, + "learning_rate": 1.112205242608039e-05, + "loss": 0.5227, + "step": 6191 + }, + { + "epoch": 0.48, + "grad_norm": 1.3314113572883042, + "learning_rate": 1.1119555586990376e-05, + "loss": 0.6453, + "step": 6192 + }, + { + "epoch": 0.48, + "grad_norm": 1.1504087592413548, + "learning_rate": 1.1117058677217043e-05, + "loss": 0.5506, + "step": 6193 + }, + { + "epoch": 0.48, + "grad_norm": 1.1109482261687162, + "learning_rate": 1.111456169691803e-05, + "loss": 0.5282, + "step": 6194 + }, + { + "epoch": 0.48, + "grad_norm": 1.2655304767491267, + "learning_rate": 1.1112064646250988e-05, + "loss": 0.5556, + "step": 6195 + }, + { + "epoch": 0.48, + "grad_norm": 1.1314574457140631, + "learning_rate": 1.1109567525373569e-05, + "loss": 0.4926, + "step": 6196 + }, + { + "epoch": 0.48, + "grad_norm": 1.2467107889370879, + "learning_rate": 1.1107070334443426e-05, + "loss": 0.6065, + "step": 6197 + }, + { + "epoch": 0.48, + "grad_norm": 1.098095083247121, + "learning_rate": 1.1104573073618222e-05, + "loss": 0.5326, + "step": 6198 + }, + { + "epoch": 0.48, + "grad_norm": 1.0949034467133698, + "learning_rate": 1.1102075743055618e-05, + "loss": 0.5128, + "step": 6199 + }, + { + "epoch": 0.48, + "grad_norm": 1.2051167536005194, + "learning_rate": 1.1099578342913289e-05, + "loss": 0.5695, + "step": 6200 + }, + { + "epoch": 0.48, + "grad_norm": 1.1850271075403587, + "learning_rate": 1.1097080873348905e-05, + "loss": 0.598, + "step": 6201 + }, + { + "epoch": 0.48, + "grad_norm": 1.270252574525389, + "learning_rate": 1.1094583334520146e-05, + "loss": 0.6388, + "step": 6202 + }, + { + "epoch": 0.48, + "grad_norm": 1.1029827620576405, + "learning_rate": 1.1092085726584693e-05, + "loss": 0.5326, + "step": 6203 + }, + { + "epoch": 0.48, + "grad_norm": 1.1215251664064685, + "learning_rate": 1.1089588049700234e-05, + "loss": 0.5437, + "step": 6204 + }, + { + "epoch": 0.48, + "grad_norm": 1.155188459816482, + "learning_rate": 1.1087090304024462e-05, + "loss": 0.544, + "step": 6205 + }, + { + "epoch": 0.48, + "grad_norm": 1.1474209912428457, + "learning_rate": 1.1084592489715067e-05, + "loss": 0.5899, + "step": 6206 + }, + { + "epoch": 0.48, + "grad_norm": 1.2369202556301464, + "learning_rate": 1.1082094606929754e-05, + "loss": 0.5627, + "step": 6207 + }, + { + "epoch": 0.48, + "grad_norm": 1.2608004792179852, + "learning_rate": 1.1079596655826227e-05, + "loss": 0.5612, + "step": 6208 + }, + { + "epoch": 0.48, + "grad_norm": 1.1596070604892992, + "learning_rate": 1.1077098636562191e-05, + "loss": 0.5742, + "step": 6209 + }, + { + "epoch": 0.48, + "grad_norm": 1.1999462791180133, + "learning_rate": 1.1074600549295363e-05, + "loss": 0.5375, + "step": 6210 + }, + { + "epoch": 0.48, + "grad_norm": 1.1406132945675063, + "learning_rate": 1.1072102394183456e-05, + "loss": 0.6112, + "step": 6211 + }, + { + "epoch": 0.48, + "grad_norm": 1.1389317420454288, + "learning_rate": 1.1069604171384194e-05, + "loss": 0.5766, + "step": 6212 + }, + { + "epoch": 0.48, + "grad_norm": 1.21567475087168, + "learning_rate": 1.1067105881055303e-05, + "loss": 0.61, + "step": 6213 + }, + { + "epoch": 0.48, + "grad_norm": 1.1913558198855603, + "learning_rate": 1.106460752335451e-05, + "loss": 0.5465, + "step": 6214 + }, + { + "epoch": 0.48, + "grad_norm": 1.2962121763385255, + "learning_rate": 1.1062109098439555e-05, + "loss": 0.5838, + "step": 6215 + }, + { + "epoch": 0.48, + "grad_norm": 1.1993759996530284, + "learning_rate": 1.105961060646817e-05, + "loss": 0.5563, + "step": 6216 + }, + { + "epoch": 0.48, + "grad_norm": 1.2047749884634675, + "learning_rate": 1.1057112047598102e-05, + "loss": 0.5677, + "step": 6217 + }, + { + "epoch": 0.48, + "grad_norm": 1.0570390271043815, + "learning_rate": 1.1054613421987098e-05, + "loss": 0.5383, + "step": 6218 + }, + { + "epoch": 0.48, + "grad_norm": 1.188914761281565, + "learning_rate": 1.1052114729792902e-05, + "loss": 0.5815, + "step": 6219 + }, + { + "epoch": 0.48, + "grad_norm": 1.2046005811394422, + "learning_rate": 1.104961597117328e-05, + "loss": 0.5691, + "step": 6220 + }, + { + "epoch": 0.48, + "grad_norm": 1.1500618275317596, + "learning_rate": 1.1047117146285984e-05, + "loss": 0.5111, + "step": 6221 + }, + { + "epoch": 0.48, + "grad_norm": 1.0534887212105184, + "learning_rate": 1.1044618255288781e-05, + "loss": 0.5623, + "step": 6222 + }, + { + "epoch": 0.48, + "grad_norm": 1.1762681124161223, + "learning_rate": 1.104211929833944e-05, + "loss": 0.54, + "step": 6223 + }, + { + "epoch": 0.48, + "grad_norm": 1.0849735729673213, + "learning_rate": 1.1039620275595729e-05, + "loss": 0.5644, + "step": 6224 + }, + { + "epoch": 0.48, + "grad_norm": 1.082915182598213, + "learning_rate": 1.1037121187215427e-05, + "loss": 0.5286, + "step": 6225 + }, + { + "epoch": 0.48, + "grad_norm": 1.1481968114964451, + "learning_rate": 1.1034622033356311e-05, + "loss": 0.5686, + "step": 6226 + }, + { + "epoch": 0.48, + "grad_norm": 1.1205564799851595, + "learning_rate": 1.103212281417617e-05, + "loss": 0.5383, + "step": 6227 + }, + { + "epoch": 0.48, + "grad_norm": 1.0921265133487799, + "learning_rate": 1.1029623529832793e-05, + "loss": 0.5392, + "step": 6228 + }, + { + "epoch": 0.48, + "grad_norm": 1.04841506183565, + "learning_rate": 1.1027124180483965e-05, + "loss": 0.5416, + "step": 6229 + }, + { + "epoch": 0.48, + "grad_norm": 1.168596029084298, + "learning_rate": 1.1024624766287492e-05, + "loss": 0.5397, + "step": 6230 + }, + { + "epoch": 0.48, + "grad_norm": 1.1717812055563395, + "learning_rate": 1.1022125287401172e-05, + "loss": 0.5511, + "step": 6231 + }, + { + "epoch": 0.48, + "grad_norm": 1.248813256541239, + "learning_rate": 1.1019625743982807e-05, + "loss": 0.5685, + "step": 6232 + }, + { + "epoch": 0.48, + "grad_norm": 1.1908274761257898, + "learning_rate": 1.101712613619021e-05, + "loss": 0.5442, + "step": 6233 + }, + { + "epoch": 0.48, + "grad_norm": 1.1641961347436167, + "learning_rate": 1.1014626464181191e-05, + "loss": 0.543, + "step": 6234 + }, + { + "epoch": 0.48, + "grad_norm": 1.084253556104743, + "learning_rate": 1.1012126728113567e-05, + "loss": 0.4856, + "step": 6235 + }, + { + "epoch": 0.48, + "grad_norm": 1.1528492998561048, + "learning_rate": 1.1009626928145163e-05, + "loss": 0.6098, + "step": 6236 + }, + { + "epoch": 0.48, + "grad_norm": 1.146757700895703, + "learning_rate": 1.1007127064433802e-05, + "loss": 0.5844, + "step": 6237 + }, + { + "epoch": 0.48, + "grad_norm": 1.1120951440916955, + "learning_rate": 1.1004627137137314e-05, + "loss": 0.5116, + "step": 6238 + }, + { + "epoch": 0.48, + "grad_norm": 1.1138632340672925, + "learning_rate": 1.1002127146413531e-05, + "loss": 0.5143, + "step": 6239 + }, + { + "epoch": 0.48, + "grad_norm": 1.2878599163787527, + "learning_rate": 1.0999627092420291e-05, + "loss": 0.5805, + "step": 6240 + }, + { + "epoch": 0.48, + "grad_norm": 1.2106485329675938, + "learning_rate": 1.0997126975315433e-05, + "loss": 0.5232, + "step": 6241 + }, + { + "epoch": 0.48, + "grad_norm": 1.0727876320192133, + "learning_rate": 1.0994626795256806e-05, + "loss": 0.5055, + "step": 6242 + }, + { + "epoch": 0.48, + "grad_norm": 1.1713025538685444, + "learning_rate": 1.0992126552402261e-05, + "loss": 0.5475, + "step": 6243 + }, + { + "epoch": 0.48, + "grad_norm": 1.263902646060917, + "learning_rate": 1.0989626246909642e-05, + "loss": 0.5828, + "step": 6244 + }, + { + "epoch": 0.48, + "grad_norm": 1.2439657474659542, + "learning_rate": 1.0987125878936814e-05, + "loss": 0.5851, + "step": 6245 + }, + { + "epoch": 0.48, + "grad_norm": 1.193978556378137, + "learning_rate": 1.0984625448641639e-05, + "loss": 0.5448, + "step": 6246 + }, + { + "epoch": 0.48, + "grad_norm": 1.191254002583395, + "learning_rate": 1.0982124956181979e-05, + "loss": 0.5728, + "step": 6247 + }, + { + "epoch": 0.48, + "grad_norm": 1.1800802726435635, + "learning_rate": 1.0979624401715702e-05, + "loss": 0.5526, + "step": 6248 + }, + { + "epoch": 0.48, + "grad_norm": 1.193635449730372, + "learning_rate": 1.0977123785400684e-05, + "loss": 0.5814, + "step": 6249 + }, + { + "epoch": 0.48, + "grad_norm": 1.2722000006762226, + "learning_rate": 1.0974623107394797e-05, + "loss": 0.5663, + "step": 6250 + }, + { + "epoch": 0.48, + "grad_norm": 1.2157040705130804, + "learning_rate": 1.097212236785593e-05, + "loss": 0.5649, + "step": 6251 + }, + { + "epoch": 0.49, + "grad_norm": 1.282610519807765, + "learning_rate": 1.0969621566941959e-05, + "loss": 0.5823, + "step": 6252 + }, + { + "epoch": 0.49, + "grad_norm": 1.0865906322437042, + "learning_rate": 1.0967120704810776e-05, + "loss": 0.5579, + "step": 6253 + }, + { + "epoch": 0.49, + "grad_norm": 1.1548739828581531, + "learning_rate": 1.0964619781620271e-05, + "loss": 0.5986, + "step": 6254 + }, + { + "epoch": 0.49, + "grad_norm": 1.2150745233985072, + "learning_rate": 1.0962118797528344e-05, + "loss": 0.5374, + "step": 6255 + }, + { + "epoch": 0.49, + "grad_norm": 1.15572927966926, + "learning_rate": 1.0959617752692897e-05, + "loss": 0.5211, + "step": 6256 + }, + { + "epoch": 0.49, + "grad_norm": 1.1692748151115242, + "learning_rate": 1.0957116647271829e-05, + "loss": 0.5543, + "step": 6257 + }, + { + "epoch": 0.49, + "grad_norm": 1.0414781527011072, + "learning_rate": 1.0954615481423047e-05, + "loss": 0.5168, + "step": 6258 + }, + { + "epoch": 0.49, + "grad_norm": 1.1498510782594316, + "learning_rate": 1.0952114255304465e-05, + "loss": 0.5256, + "step": 6259 + }, + { + "epoch": 0.49, + "grad_norm": 1.1825853338763397, + "learning_rate": 1.0949612969073995e-05, + "loss": 0.5309, + "step": 6260 + }, + { + "epoch": 0.49, + "grad_norm": 1.2336561789971732, + "learning_rate": 1.0947111622889563e-05, + "loss": 0.6284, + "step": 6261 + }, + { + "epoch": 0.49, + "grad_norm": 1.1557986950972743, + "learning_rate": 1.0944610216909086e-05, + "loss": 0.5068, + "step": 6262 + }, + { + "epoch": 0.49, + "grad_norm": 1.30853617527261, + "learning_rate": 1.0942108751290494e-05, + "loss": 0.599, + "step": 6263 + }, + { + "epoch": 0.49, + "grad_norm": 1.068262461951388, + "learning_rate": 1.0939607226191716e-05, + "loss": 0.5627, + "step": 6264 + }, + { + "epoch": 0.49, + "grad_norm": 1.1874667213193926, + "learning_rate": 1.0937105641770682e-05, + "loss": 0.5241, + "step": 6265 + }, + { + "epoch": 0.49, + "grad_norm": 1.1902940405277826, + "learning_rate": 1.0934603998185338e-05, + "loss": 0.5698, + "step": 6266 + }, + { + "epoch": 0.49, + "grad_norm": 1.0901840936430078, + "learning_rate": 1.0932102295593621e-05, + "loss": 0.5125, + "step": 6267 + }, + { + "epoch": 0.49, + "grad_norm": 1.1855604494628034, + "learning_rate": 1.0929600534153477e-05, + "loss": 0.5043, + "step": 6268 + }, + { + "epoch": 0.49, + "grad_norm": 1.1662671506383175, + "learning_rate": 1.0927098714022854e-05, + "loss": 0.5761, + "step": 6269 + }, + { + "epoch": 0.49, + "grad_norm": 1.1207815659396392, + "learning_rate": 1.0924596835359706e-05, + "loss": 0.5716, + "step": 6270 + }, + { + "epoch": 0.49, + "grad_norm": 1.0947266713941182, + "learning_rate": 1.092209489832199e-05, + "loss": 0.5675, + "step": 6271 + }, + { + "epoch": 0.49, + "grad_norm": 1.327508087492767, + "learning_rate": 1.0919592903067668e-05, + "loss": 0.6027, + "step": 6272 + }, + { + "epoch": 0.49, + "grad_norm": 1.1924033073394782, + "learning_rate": 1.09170908497547e-05, + "loss": 0.5869, + "step": 6273 + }, + { + "epoch": 0.49, + "grad_norm": 1.0611016664755974, + "learning_rate": 1.0914588738541055e-05, + "loss": 0.5745, + "step": 6274 + }, + { + "epoch": 0.49, + "grad_norm": 1.0868997495793626, + "learning_rate": 1.0912086569584702e-05, + "loss": 0.537, + "step": 6275 + }, + { + "epoch": 0.49, + "grad_norm": 1.1296068281438767, + "learning_rate": 1.090958434304362e-05, + "loss": 0.5627, + "step": 6276 + }, + { + "epoch": 0.49, + "grad_norm": 1.2191840645953926, + "learning_rate": 1.0907082059075786e-05, + "loss": 0.5763, + "step": 6277 + }, + { + "epoch": 0.49, + "grad_norm": 1.052527922307498, + "learning_rate": 1.090457971783918e-05, + "loss": 0.53, + "step": 6278 + }, + { + "epoch": 0.49, + "grad_norm": 1.2259940998937324, + "learning_rate": 1.0902077319491792e-05, + "loss": 0.5499, + "step": 6279 + }, + { + "epoch": 0.49, + "grad_norm": 1.2058268837711255, + "learning_rate": 1.0899574864191607e-05, + "loss": 0.5682, + "step": 6280 + }, + { + "epoch": 0.49, + "grad_norm": 1.2088781750798747, + "learning_rate": 1.089707235209662e-05, + "loss": 0.603, + "step": 6281 + }, + { + "epoch": 0.49, + "grad_norm": 1.2780784337297917, + "learning_rate": 1.089456978336483e-05, + "loss": 0.6067, + "step": 6282 + }, + { + "epoch": 0.49, + "grad_norm": 1.174701790810231, + "learning_rate": 1.089206715815423e-05, + "loss": 0.5304, + "step": 6283 + }, + { + "epoch": 0.49, + "grad_norm": 1.1634918452666847, + "learning_rate": 1.0889564476622829e-05, + "loss": 0.5746, + "step": 6284 + }, + { + "epoch": 0.49, + "grad_norm": 1.1119609834254902, + "learning_rate": 1.0887061738928632e-05, + "loss": 0.4633, + "step": 6285 + }, + { + "epoch": 0.49, + "grad_norm": 1.1644985739083769, + "learning_rate": 1.0884558945229652e-05, + "loss": 0.5484, + "step": 6286 + }, + { + "epoch": 0.49, + "grad_norm": 1.0371188003485314, + "learning_rate": 1.0882056095683905e-05, + "loss": 0.5112, + "step": 6287 + }, + { + "epoch": 0.49, + "grad_norm": 1.0866193757464007, + "learning_rate": 1.0879553190449402e-05, + "loss": 0.5504, + "step": 6288 + }, + { + "epoch": 0.49, + "grad_norm": 1.2967099923558516, + "learning_rate": 1.0877050229684169e-05, + "loss": 0.5938, + "step": 6289 + }, + { + "epoch": 0.49, + "grad_norm": 1.1989126404982595, + "learning_rate": 1.087454721354623e-05, + "loss": 0.5652, + "step": 6290 + }, + { + "epoch": 0.49, + "grad_norm": 1.1253676343619814, + "learning_rate": 1.0872044142193614e-05, + "loss": 0.5584, + "step": 6291 + }, + { + "epoch": 0.49, + "grad_norm": 1.1026007987241102, + "learning_rate": 1.0869541015784353e-05, + "loss": 0.5484, + "step": 6292 + }, + { + "epoch": 0.49, + "grad_norm": 1.0925516921150882, + "learning_rate": 1.0867037834476477e-05, + "loss": 0.5529, + "step": 6293 + }, + { + "epoch": 0.49, + "grad_norm": 1.1207733228044954, + "learning_rate": 1.0864534598428034e-05, + "loss": 0.5793, + "step": 6294 + }, + { + "epoch": 0.49, + "grad_norm": 1.2878947199211705, + "learning_rate": 1.0862031307797059e-05, + "loss": 0.5352, + "step": 6295 + }, + { + "epoch": 0.49, + "grad_norm": 1.162445891566968, + "learning_rate": 1.08595279627416e-05, + "loss": 0.5072, + "step": 6296 + }, + { + "epoch": 0.49, + "grad_norm": 1.2525412001086693, + "learning_rate": 1.085702456341971e-05, + "loss": 0.5884, + "step": 6297 + }, + { + "epoch": 0.49, + "grad_norm": 1.1914150550391847, + "learning_rate": 1.085452110998943e-05, + "loss": 0.5651, + "step": 6298 + }, + { + "epoch": 0.49, + "grad_norm": 1.179156051066751, + "learning_rate": 1.0852017602608831e-05, + "loss": 0.5558, + "step": 6299 + }, + { + "epoch": 0.49, + "grad_norm": 1.1901141555110957, + "learning_rate": 1.084951404143596e-05, + "loss": 0.5582, + "step": 6300 + }, + { + "epoch": 0.49, + "grad_norm": 1.1525455621414233, + "learning_rate": 1.0847010426628888e-05, + "loss": 0.5806, + "step": 6301 + }, + { + "epoch": 0.49, + "grad_norm": 1.245375375345019, + "learning_rate": 1.0844506758345676e-05, + "loss": 0.5989, + "step": 6302 + }, + { + "epoch": 0.49, + "grad_norm": 1.2113059683073955, + "learning_rate": 1.0842003036744395e-05, + "loss": 0.605, + "step": 6303 + }, + { + "epoch": 0.49, + "grad_norm": 1.1326986321408101, + "learning_rate": 1.0839499261983119e-05, + "loss": 0.5538, + "step": 6304 + }, + { + "epoch": 0.49, + "grad_norm": 1.1612838762349662, + "learning_rate": 1.0836995434219924e-05, + "loss": 0.5823, + "step": 6305 + }, + { + "epoch": 0.49, + "grad_norm": 1.0554097845970336, + "learning_rate": 1.0834491553612889e-05, + "loss": 0.5296, + "step": 6306 + }, + { + "epoch": 0.49, + "grad_norm": 1.127822514185915, + "learning_rate": 1.0831987620320096e-05, + "loss": 0.5667, + "step": 6307 + }, + { + "epoch": 0.49, + "grad_norm": 1.2216793459762962, + "learning_rate": 1.0829483634499632e-05, + "loss": 0.6178, + "step": 6308 + }, + { + "epoch": 0.49, + "grad_norm": 1.0998509956356397, + "learning_rate": 1.0826979596309586e-05, + "loss": 0.5163, + "step": 6309 + }, + { + "epoch": 0.49, + "grad_norm": 1.2600095054479847, + "learning_rate": 1.082447550590805e-05, + "loss": 0.5673, + "step": 6310 + }, + { + "epoch": 0.49, + "grad_norm": 1.112431626951256, + "learning_rate": 1.0821971363453125e-05, + "loss": 0.5487, + "step": 6311 + }, + { + "epoch": 0.49, + "grad_norm": 1.133429418492436, + "learning_rate": 1.0819467169102906e-05, + "loss": 0.5242, + "step": 6312 + }, + { + "epoch": 0.49, + "grad_norm": 1.1067683799566865, + "learning_rate": 1.0816962923015495e-05, + "loss": 0.5654, + "step": 6313 + }, + { + "epoch": 0.49, + "grad_norm": 1.1814836382029623, + "learning_rate": 1.0814458625349002e-05, + "loss": 0.6146, + "step": 6314 + }, + { + "epoch": 0.49, + "grad_norm": 1.197842482329181, + "learning_rate": 1.0811954276261532e-05, + "loss": 0.5133, + "step": 6315 + }, + { + "epoch": 0.49, + "grad_norm": 1.2046544645655877, + "learning_rate": 1.08094498759112e-05, + "loss": 0.5578, + "step": 6316 + }, + { + "epoch": 0.49, + "grad_norm": 1.2002400078765783, + "learning_rate": 1.0806945424456124e-05, + "loss": 0.5531, + "step": 6317 + }, + { + "epoch": 0.49, + "grad_norm": 1.2547361770631573, + "learning_rate": 1.0804440922054415e-05, + "loss": 0.6205, + "step": 6318 + }, + { + "epoch": 0.49, + "grad_norm": 1.0131846526726394, + "learning_rate": 1.0801936368864203e-05, + "loss": 0.4707, + "step": 6319 + }, + { + "epoch": 0.49, + "grad_norm": 1.2048835782836915, + "learning_rate": 1.0799431765043609e-05, + "loss": 0.5665, + "step": 6320 + }, + { + "epoch": 0.49, + "grad_norm": 1.12954234645605, + "learning_rate": 1.0796927110750766e-05, + "loss": 0.5378, + "step": 6321 + }, + { + "epoch": 0.49, + "grad_norm": 1.1835233963304486, + "learning_rate": 1.0794422406143802e-05, + "loss": 0.5567, + "step": 6322 + }, + { + "epoch": 0.49, + "grad_norm": 1.1189268750219665, + "learning_rate": 1.079191765138085e-05, + "loss": 0.5382, + "step": 6323 + }, + { + "epoch": 0.49, + "grad_norm": 1.14656578697717, + "learning_rate": 1.0789412846620052e-05, + "loss": 0.561, + "step": 6324 + }, + { + "epoch": 0.49, + "grad_norm": 1.1947010944939058, + "learning_rate": 1.078690799201955e-05, + "loss": 0.5726, + "step": 6325 + }, + { + "epoch": 0.49, + "grad_norm": 1.1365450696890864, + "learning_rate": 1.0784403087737487e-05, + "loss": 0.5449, + "step": 6326 + }, + { + "epoch": 0.49, + "grad_norm": 1.1423639080370231, + "learning_rate": 1.078189813393201e-05, + "loss": 0.53, + "step": 6327 + }, + { + "epoch": 0.49, + "grad_norm": 1.0225251768785275, + "learning_rate": 1.0779393130761267e-05, + "loss": 0.5306, + "step": 6328 + }, + { + "epoch": 0.49, + "grad_norm": 1.118557230362184, + "learning_rate": 1.0776888078383415e-05, + "loss": 0.51, + "step": 6329 + }, + { + "epoch": 0.49, + "grad_norm": 1.182975532049757, + "learning_rate": 1.0774382976956613e-05, + "loss": 0.5317, + "step": 6330 + }, + { + "epoch": 0.49, + "grad_norm": 1.1382466433362692, + "learning_rate": 1.0771877826639016e-05, + "loss": 0.5587, + "step": 6331 + }, + { + "epoch": 0.49, + "grad_norm": 1.1963473558820212, + "learning_rate": 1.0769372627588792e-05, + "loss": 0.5636, + "step": 6332 + }, + { + "epoch": 0.49, + "grad_norm": 1.0906775235095851, + "learning_rate": 1.0766867379964101e-05, + "loss": 0.5241, + "step": 6333 + }, + { + "epoch": 0.49, + "grad_norm": 1.1583652736642014, + "learning_rate": 1.0764362083923117e-05, + "loss": 0.5167, + "step": 6334 + }, + { + "epoch": 0.49, + "grad_norm": 1.2013230143416829, + "learning_rate": 1.0761856739624012e-05, + "loss": 0.5582, + "step": 6335 + }, + { + "epoch": 0.49, + "grad_norm": 1.1879429743884447, + "learning_rate": 1.0759351347224961e-05, + "loss": 0.5507, + "step": 6336 + }, + { + "epoch": 0.49, + "grad_norm": 1.153090102745418, + "learning_rate": 1.0756845906884141e-05, + "loss": 0.5871, + "step": 6337 + }, + { + "epoch": 0.49, + "grad_norm": 1.1621570737650624, + "learning_rate": 1.0754340418759734e-05, + "loss": 0.556, + "step": 6338 + }, + { + "epoch": 0.49, + "grad_norm": 1.1537395752077297, + "learning_rate": 1.0751834883009922e-05, + "loss": 0.5413, + "step": 6339 + }, + { + "epoch": 0.49, + "grad_norm": 1.2374101818126393, + "learning_rate": 1.0749329299792898e-05, + "loss": 0.6011, + "step": 6340 + }, + { + "epoch": 0.49, + "grad_norm": 1.1537678340471695, + "learning_rate": 1.074682366926685e-05, + "loss": 0.5396, + "step": 6341 + }, + { + "epoch": 0.49, + "grad_norm": 1.1881184724149834, + "learning_rate": 1.074431799158997e-05, + "loss": 0.5643, + "step": 6342 + }, + { + "epoch": 0.49, + "grad_norm": 1.1101484154808001, + "learning_rate": 1.0741812266920453e-05, + "loss": 0.5376, + "step": 6343 + }, + { + "epoch": 0.49, + "grad_norm": 1.2491237907270207, + "learning_rate": 1.0739306495416502e-05, + "loss": 0.578, + "step": 6344 + }, + { + "epoch": 0.49, + "grad_norm": 1.1629287011291416, + "learning_rate": 1.0736800677236316e-05, + "loss": 0.5562, + "step": 6345 + }, + { + "epoch": 0.49, + "grad_norm": 1.1693825728529563, + "learning_rate": 1.0734294812538105e-05, + "loss": 0.601, + "step": 6346 + }, + { + "epoch": 0.49, + "grad_norm": 1.247201552688239, + "learning_rate": 1.0731788901480071e-05, + "loss": 0.5603, + "step": 6347 + }, + { + "epoch": 0.49, + "grad_norm": 1.1101095427339125, + "learning_rate": 1.072928294422043e-05, + "loss": 0.5114, + "step": 6348 + }, + { + "epoch": 0.49, + "grad_norm": 1.1596878596155982, + "learning_rate": 1.0726776940917391e-05, + "loss": 0.585, + "step": 6349 + }, + { + "epoch": 0.49, + "grad_norm": 1.1022584488792138, + "learning_rate": 1.0724270891729178e-05, + "loss": 0.5236, + "step": 6350 + }, + { + "epoch": 0.49, + "grad_norm": 1.0470160204291132, + "learning_rate": 1.0721764796814005e-05, + "loss": 0.4791, + "step": 6351 + }, + { + "epoch": 0.49, + "grad_norm": 1.113496297846422, + "learning_rate": 1.0719258656330095e-05, + "loss": 0.5885, + "step": 6352 + }, + { + "epoch": 0.49, + "grad_norm": 1.2683157868410164, + "learning_rate": 1.071675247043568e-05, + "loss": 0.6067, + "step": 6353 + }, + { + "epoch": 0.49, + "grad_norm": 1.1515656511323173, + "learning_rate": 1.0714246239288977e-05, + "loss": 0.5735, + "step": 6354 + }, + { + "epoch": 0.49, + "grad_norm": 1.187368988037953, + "learning_rate": 1.0711739963048229e-05, + "loss": 0.5395, + "step": 6355 + }, + { + "epoch": 0.49, + "grad_norm": 1.1191194275983907, + "learning_rate": 1.0709233641871663e-05, + "loss": 0.567, + "step": 6356 + }, + { + "epoch": 0.49, + "grad_norm": 1.2201148484844067, + "learning_rate": 1.0706727275917519e-05, + "loss": 0.583, + "step": 6357 + }, + { + "epoch": 0.49, + "grad_norm": 1.1183576524556678, + "learning_rate": 1.0704220865344036e-05, + "loss": 0.5268, + "step": 6358 + }, + { + "epoch": 0.49, + "grad_norm": 1.1074173538680858, + "learning_rate": 1.0701714410309454e-05, + "loss": 0.5301, + "step": 6359 + }, + { + "epoch": 0.49, + "grad_norm": 1.2581336517718718, + "learning_rate": 1.0699207910972022e-05, + "loss": 0.6065, + "step": 6360 + }, + { + "epoch": 0.49, + "grad_norm": 1.198044291898656, + "learning_rate": 1.069670136748999e-05, + "loss": 0.5631, + "step": 6361 + }, + { + "epoch": 0.49, + "grad_norm": 1.2029244887482289, + "learning_rate": 1.0694194780021603e-05, + "loss": 0.567, + "step": 6362 + }, + { + "epoch": 0.49, + "grad_norm": 1.1244606738611367, + "learning_rate": 1.069168814872512e-05, + "loss": 0.5642, + "step": 6363 + }, + { + "epoch": 0.49, + "grad_norm": 1.1542943964123136, + "learning_rate": 1.0689181473758793e-05, + "loss": 0.5415, + "step": 6364 + }, + { + "epoch": 0.49, + "grad_norm": 1.106997830916901, + "learning_rate": 1.0686674755280886e-05, + "loss": 0.5411, + "step": 6365 + }, + { + "epoch": 0.49, + "grad_norm": 1.2113052794102166, + "learning_rate": 1.068416799344966e-05, + "loss": 0.5413, + "step": 6366 + }, + { + "epoch": 0.49, + "grad_norm": 1.190706491560063, + "learning_rate": 1.0681661188423373e-05, + "loss": 0.5769, + "step": 6367 + }, + { + "epoch": 0.49, + "grad_norm": 1.1948511567253493, + "learning_rate": 1.0679154340360305e-05, + "loss": 0.5415, + "step": 6368 + }, + { + "epoch": 0.49, + "grad_norm": 1.2501847607441907, + "learning_rate": 1.0676647449418713e-05, + "loss": 0.5796, + "step": 6369 + }, + { + "epoch": 0.49, + "grad_norm": 1.1075798339141731, + "learning_rate": 1.067414051575688e-05, + "loss": 0.5408, + "step": 6370 + }, + { + "epoch": 0.49, + "grad_norm": 1.1636644745406046, + "learning_rate": 1.0671633539533082e-05, + "loss": 0.5405, + "step": 6371 + }, + { + "epoch": 0.49, + "grad_norm": 1.2107047072577046, + "learning_rate": 1.0669126520905588e-05, + "loss": 0.5397, + "step": 6372 + }, + { + "epoch": 0.49, + "grad_norm": 1.2012178737857797, + "learning_rate": 1.0666619460032688e-05, + "loss": 0.5712, + "step": 6373 + }, + { + "epoch": 0.49, + "grad_norm": 1.1286669942893288, + "learning_rate": 1.0664112357072658e-05, + "loss": 0.4973, + "step": 6374 + }, + { + "epoch": 0.49, + "grad_norm": 1.1936473842275321, + "learning_rate": 1.0661605212183791e-05, + "loss": 0.5289, + "step": 6375 + }, + { + "epoch": 0.49, + "grad_norm": 1.1477808729509356, + "learning_rate": 1.0659098025524374e-05, + "loss": 0.5972, + "step": 6376 + }, + { + "epoch": 0.49, + "grad_norm": 1.1864971142270393, + "learning_rate": 1.0656590797252697e-05, + "loss": 0.5544, + "step": 6377 + }, + { + "epoch": 0.49, + "grad_norm": 1.2993287921302266, + "learning_rate": 1.0654083527527056e-05, + "loss": 0.593, + "step": 6378 + }, + { + "epoch": 0.49, + "grad_norm": 1.2390407315010037, + "learning_rate": 1.0651576216505747e-05, + "loss": 0.5953, + "step": 6379 + }, + { + "epoch": 0.49, + "grad_norm": 1.2142161130993105, + "learning_rate": 1.0649068864347072e-05, + "loss": 0.5534, + "step": 6380 + }, + { + "epoch": 0.5, + "grad_norm": 1.2344725365069458, + "learning_rate": 1.0646561471209328e-05, + "loss": 0.5721, + "step": 6381 + }, + { + "epoch": 0.5, + "grad_norm": 1.2690621785922231, + "learning_rate": 1.0644054037250825e-05, + "loss": 0.577, + "step": 6382 + }, + { + "epoch": 0.5, + "grad_norm": 1.1976319790226606, + "learning_rate": 1.0641546562629865e-05, + "loss": 0.5779, + "step": 6383 + }, + { + "epoch": 0.5, + "grad_norm": 1.260962481056273, + "learning_rate": 1.0639039047504763e-05, + "loss": 0.5964, + "step": 6384 + }, + { + "epoch": 0.5, + "grad_norm": 1.1301798780290067, + "learning_rate": 1.0636531492033826e-05, + "loss": 0.5542, + "step": 6385 + }, + { + "epoch": 0.5, + "grad_norm": 1.2013236097315585, + "learning_rate": 1.0634023896375376e-05, + "loss": 0.5422, + "step": 6386 + }, + { + "epoch": 0.5, + "grad_norm": 1.219123245208269, + "learning_rate": 1.0631516260687722e-05, + "loss": 0.592, + "step": 6387 + }, + { + "epoch": 0.5, + "grad_norm": 1.3344785967239943, + "learning_rate": 1.062900858512919e-05, + "loss": 0.5912, + "step": 6388 + }, + { + "epoch": 0.5, + "grad_norm": 1.261559491100791, + "learning_rate": 1.06265008698581e-05, + "loss": 0.6231, + "step": 6389 + }, + { + "epoch": 0.5, + "grad_norm": 1.128106120789023, + "learning_rate": 1.0623993115032781e-05, + "loss": 0.5315, + "step": 6390 + }, + { + "epoch": 0.5, + "grad_norm": 1.1808282250558813, + "learning_rate": 1.0621485320811552e-05, + "loss": 0.5405, + "step": 6391 + }, + { + "epoch": 0.5, + "grad_norm": 1.1439660665574554, + "learning_rate": 1.0618977487352752e-05, + "loss": 0.5359, + "step": 6392 + }, + { + "epoch": 0.5, + "grad_norm": 1.2414900063071381, + "learning_rate": 1.061646961481471e-05, + "loss": 0.6137, + "step": 6393 + }, + { + "epoch": 0.5, + "grad_norm": 1.136545541682416, + "learning_rate": 1.0613961703355758e-05, + "loss": 0.5369, + "step": 6394 + }, + { + "epoch": 0.5, + "grad_norm": 1.221299414908821, + "learning_rate": 1.0611453753134237e-05, + "loss": 0.6023, + "step": 6395 + }, + { + "epoch": 0.5, + "grad_norm": 1.0786747843665214, + "learning_rate": 1.060894576430849e-05, + "loss": 0.5066, + "step": 6396 + }, + { + "epoch": 0.5, + "grad_norm": 1.2116762430493864, + "learning_rate": 1.0606437737036849e-05, + "loss": 0.6093, + "step": 6397 + }, + { + "epoch": 0.5, + "grad_norm": 1.2845438509769538, + "learning_rate": 1.0603929671477669e-05, + "loss": 0.6411, + "step": 6398 + }, + { + "epoch": 0.5, + "grad_norm": 1.1755855055182531, + "learning_rate": 1.0601421567789289e-05, + "loss": 0.5427, + "step": 6399 + }, + { + "epoch": 0.5, + "grad_norm": 1.14649404492435, + "learning_rate": 1.0598913426130067e-05, + "loss": 0.5598, + "step": 6400 + }, + { + "epoch": 0.5, + "grad_norm": 1.2899880525494083, + "learning_rate": 1.0596405246658348e-05, + "loss": 0.575, + "step": 6401 + }, + { + "epoch": 0.5, + "grad_norm": 1.112014317310567, + "learning_rate": 1.0593897029532487e-05, + "loss": 0.5334, + "step": 6402 + }, + { + "epoch": 0.5, + "grad_norm": 1.0481667591163069, + "learning_rate": 1.0591388774910847e-05, + "loss": 0.544, + "step": 6403 + }, + { + "epoch": 0.5, + "grad_norm": 1.1072609328666656, + "learning_rate": 1.0588880482951778e-05, + "loss": 0.5097, + "step": 6404 + }, + { + "epoch": 0.5, + "grad_norm": 1.1279984886686873, + "learning_rate": 1.0586372153813649e-05, + "loss": 0.574, + "step": 6405 + }, + { + "epoch": 0.5, + "grad_norm": 1.1469497905740156, + "learning_rate": 1.058386378765482e-05, + "loss": 0.5426, + "step": 6406 + }, + { + "epoch": 0.5, + "grad_norm": 1.216356617252466, + "learning_rate": 1.0581355384633655e-05, + "loss": 0.5958, + "step": 6407 + }, + { + "epoch": 0.5, + "grad_norm": 1.1625914527442378, + "learning_rate": 1.0578846944908528e-05, + "loss": 0.4933, + "step": 6408 + }, + { + "epoch": 0.5, + "grad_norm": 1.1409076706208394, + "learning_rate": 1.0576338468637805e-05, + "loss": 0.591, + "step": 6409 + }, + { + "epoch": 0.5, + "grad_norm": 1.1989841293799026, + "learning_rate": 1.0573829955979864e-05, + "loss": 0.5128, + "step": 6410 + }, + { + "epoch": 0.5, + "grad_norm": 1.2182205713084933, + "learning_rate": 1.0571321407093076e-05, + "loss": 0.6106, + "step": 6411 + }, + { + "epoch": 0.5, + "grad_norm": 1.5133196414770653, + "learning_rate": 1.0568812822135819e-05, + "loss": 0.5609, + "step": 6412 + }, + { + "epoch": 0.5, + "grad_norm": 1.1435371742894647, + "learning_rate": 1.0566304201266473e-05, + "loss": 0.5382, + "step": 6413 + }, + { + "epoch": 0.5, + "grad_norm": 1.212682488069999, + "learning_rate": 1.0563795544643422e-05, + "loss": 0.5654, + "step": 6414 + }, + { + "epoch": 0.5, + "grad_norm": 1.1751436044311612, + "learning_rate": 1.0561286852425052e-05, + "loss": 0.556, + "step": 6415 + }, + { + "epoch": 0.5, + "grad_norm": 1.297821159379831, + "learning_rate": 1.0558778124769747e-05, + "loss": 0.5435, + "step": 6416 + }, + { + "epoch": 0.5, + "grad_norm": 1.2000484516375873, + "learning_rate": 1.0556269361835891e-05, + "loss": 0.554, + "step": 6417 + }, + { + "epoch": 0.5, + "grad_norm": 1.1933590253713942, + "learning_rate": 1.0553760563781883e-05, + "loss": 0.5305, + "step": 6418 + }, + { + "epoch": 0.5, + "grad_norm": 1.1562032947901797, + "learning_rate": 1.0551251730766114e-05, + "loss": 0.5307, + "step": 6419 + }, + { + "epoch": 0.5, + "grad_norm": 1.3157086434131653, + "learning_rate": 1.054874286294698e-05, + "loss": 0.5819, + "step": 6420 + }, + { + "epoch": 0.5, + "grad_norm": 1.0745875609779298, + "learning_rate": 1.0546233960482876e-05, + "loss": 0.523, + "step": 6421 + }, + { + "epoch": 0.5, + "grad_norm": 1.101834419182634, + "learning_rate": 1.0543725023532205e-05, + "loss": 0.5474, + "step": 6422 + }, + { + "epoch": 0.5, + "grad_norm": 1.2312347933996854, + "learning_rate": 1.0541216052253366e-05, + "loss": 0.5225, + "step": 6423 + }, + { + "epoch": 0.5, + "grad_norm": 1.143481974647742, + "learning_rate": 1.0538707046804768e-05, + "loss": 0.5907, + "step": 6424 + }, + { + "epoch": 0.5, + "grad_norm": 1.3307627345838706, + "learning_rate": 1.0536198007344816e-05, + "loss": 0.5076, + "step": 6425 + }, + { + "epoch": 0.5, + "grad_norm": 1.1568975439610916, + "learning_rate": 1.0533688934031916e-05, + "loss": 0.5882, + "step": 6426 + }, + { + "epoch": 0.5, + "grad_norm": 1.1576403429377826, + "learning_rate": 1.0531179827024478e-05, + "loss": 0.5685, + "step": 6427 + }, + { + "epoch": 0.5, + "grad_norm": 1.322009694336566, + "learning_rate": 1.0528670686480918e-05, + "loss": 0.5823, + "step": 6428 + }, + { + "epoch": 0.5, + "grad_norm": 1.1246501060601453, + "learning_rate": 1.052616151255965e-05, + "loss": 0.5357, + "step": 6429 + }, + { + "epoch": 0.5, + "grad_norm": 1.1874247577320272, + "learning_rate": 1.0523652305419092e-05, + "loss": 0.5853, + "step": 6430 + }, + { + "epoch": 0.5, + "grad_norm": 1.2036285337522425, + "learning_rate": 1.0521143065217664e-05, + "loss": 0.6928, + "step": 6431 + }, + { + "epoch": 0.5, + "grad_norm": 1.0968589738071735, + "learning_rate": 1.0518633792113782e-05, + "loss": 0.5713, + "step": 6432 + }, + { + "epoch": 0.5, + "grad_norm": 1.1880031824647297, + "learning_rate": 1.051612448626587e-05, + "loss": 0.5931, + "step": 6433 + }, + { + "epoch": 0.5, + "grad_norm": 1.1100371090811751, + "learning_rate": 1.0513615147832364e-05, + "loss": 0.525, + "step": 6434 + }, + { + "epoch": 0.5, + "grad_norm": 1.1496380609581494, + "learning_rate": 1.051110577697168e-05, + "loss": 0.607, + "step": 6435 + }, + { + "epoch": 0.5, + "grad_norm": 1.0676011876875386, + "learning_rate": 1.050859637384225e-05, + "loss": 0.5838, + "step": 6436 + }, + { + "epoch": 0.5, + "grad_norm": 1.0737795955145297, + "learning_rate": 1.050608693860251e-05, + "loss": 0.496, + "step": 6437 + }, + { + "epoch": 0.5, + "grad_norm": 1.1503433191906365, + "learning_rate": 1.0503577471410889e-05, + "loss": 0.5445, + "step": 6438 + }, + { + "epoch": 0.5, + "grad_norm": 1.1423082342667716, + "learning_rate": 1.0501067972425824e-05, + "loss": 0.5432, + "step": 6439 + }, + { + "epoch": 0.5, + "grad_norm": 1.1628046087885573, + "learning_rate": 1.0498558441805753e-05, + "loss": 0.5342, + "step": 6440 + }, + { + "epoch": 0.5, + "grad_norm": 1.110299920296153, + "learning_rate": 1.0496048879709116e-05, + "loss": 0.5762, + "step": 6441 + }, + { + "epoch": 0.5, + "grad_norm": 1.175374261622373, + "learning_rate": 1.0493539286294352e-05, + "loss": 0.5589, + "step": 6442 + }, + { + "epoch": 0.5, + "grad_norm": 1.1793212827448147, + "learning_rate": 1.0491029661719907e-05, + "loss": 0.5008, + "step": 6443 + }, + { + "epoch": 0.5, + "grad_norm": 2.108561267016032, + "learning_rate": 1.0488520006144227e-05, + "loss": 0.5186, + "step": 6444 + }, + { + "epoch": 0.5, + "grad_norm": 1.2138178378789635, + "learning_rate": 1.0486010319725759e-05, + "loss": 0.6598, + "step": 6445 + }, + { + "epoch": 0.5, + "grad_norm": 1.1794840150876267, + "learning_rate": 1.0483500602622951e-05, + "loss": 0.5719, + "step": 6446 + }, + { + "epoch": 0.5, + "grad_norm": 1.1307855822480315, + "learning_rate": 1.0480990854994257e-05, + "loss": 0.542, + "step": 6447 + }, + { + "epoch": 0.5, + "grad_norm": 1.1702967888423044, + "learning_rate": 1.0478481076998127e-05, + "loss": 0.5754, + "step": 6448 + }, + { + "epoch": 0.5, + "grad_norm": 1.2183695957529324, + "learning_rate": 1.0475971268793019e-05, + "loss": 0.5705, + "step": 6449 + }, + { + "epoch": 0.5, + "grad_norm": 1.1231250503531536, + "learning_rate": 1.0473461430537388e-05, + "loss": 0.5615, + "step": 6450 + }, + { + "epoch": 0.5, + "grad_norm": 1.1772062974466484, + "learning_rate": 1.0470951562389695e-05, + "loss": 0.5459, + "step": 6451 + }, + { + "epoch": 0.5, + "grad_norm": 1.1594341869618447, + "learning_rate": 1.04684416645084e-05, + "loss": 0.5658, + "step": 6452 + }, + { + "epoch": 0.5, + "grad_norm": 1.1167754900747189, + "learning_rate": 1.0465931737051964e-05, + "loss": 0.5336, + "step": 6453 + }, + { + "epoch": 0.5, + "grad_norm": 1.1716011744699424, + "learning_rate": 1.0463421780178857e-05, + "loss": 0.5337, + "step": 6454 + }, + { + "epoch": 0.5, + "grad_norm": 1.2492031895644735, + "learning_rate": 1.0460911794047542e-05, + "loss": 0.5967, + "step": 6455 + }, + { + "epoch": 0.5, + "grad_norm": 1.158638832002312, + "learning_rate": 1.0458401778816482e-05, + "loss": 0.5402, + "step": 6456 + }, + { + "epoch": 0.5, + "grad_norm": 1.105887990653393, + "learning_rate": 1.0455891734644158e-05, + "loss": 0.5824, + "step": 6457 + }, + { + "epoch": 0.5, + "grad_norm": 1.2536555719994473, + "learning_rate": 1.0453381661689035e-05, + "loss": 0.5849, + "step": 6458 + }, + { + "epoch": 0.5, + "grad_norm": 1.2694543433736711, + "learning_rate": 1.045087156010959e-05, + "loss": 0.6392, + "step": 6459 + }, + { + "epoch": 0.5, + "grad_norm": 1.0417108272092714, + "learning_rate": 1.0448361430064296e-05, + "loss": 0.5086, + "step": 6460 + }, + { + "epoch": 0.5, + "grad_norm": 1.2602374472239826, + "learning_rate": 1.044585127171163e-05, + "loss": 0.5603, + "step": 6461 + }, + { + "epoch": 0.5, + "grad_norm": 1.1535407105278217, + "learning_rate": 1.0443341085210077e-05, + "loss": 0.535, + "step": 6462 + }, + { + "epoch": 0.5, + "grad_norm": 1.1858109458173658, + "learning_rate": 1.0440830870718108e-05, + "loss": 0.5779, + "step": 6463 + }, + { + "epoch": 0.5, + "grad_norm": 1.2357698599338334, + "learning_rate": 1.0438320628394219e-05, + "loss": 0.4938, + "step": 6464 + }, + { + "epoch": 0.5, + "grad_norm": 1.1479697280408172, + "learning_rate": 1.0435810358396882e-05, + "loss": 0.5555, + "step": 6465 + }, + { + "epoch": 0.5, + "grad_norm": 1.1795676972648603, + "learning_rate": 1.043330006088459e-05, + "loss": 0.5988, + "step": 6466 + }, + { + "epoch": 0.5, + "grad_norm": 1.2177928687459614, + "learning_rate": 1.0430789736015829e-05, + "loss": 0.6048, + "step": 6467 + }, + { + "epoch": 0.5, + "grad_norm": 1.1370087872973356, + "learning_rate": 1.0428279383949089e-05, + "loss": 0.5043, + "step": 6468 + }, + { + "epoch": 0.5, + "grad_norm": 1.164688454093573, + "learning_rate": 1.0425769004842865e-05, + "loss": 0.5256, + "step": 6469 + }, + { + "epoch": 0.5, + "grad_norm": 1.2612197408836483, + "learning_rate": 1.0423258598855645e-05, + "loss": 0.5819, + "step": 6470 + }, + { + "epoch": 0.5, + "grad_norm": 1.1981013058989534, + "learning_rate": 1.0420748166145926e-05, + "loss": 0.6174, + "step": 6471 + }, + { + "epoch": 0.5, + "grad_norm": 1.3651490191625855, + "learning_rate": 1.0418237706872206e-05, + "loss": 0.596, + "step": 6472 + }, + { + "epoch": 0.5, + "grad_norm": 1.0885770265716346, + "learning_rate": 1.0415727221192977e-05, + "loss": 0.5212, + "step": 6473 + }, + { + "epoch": 0.5, + "grad_norm": 1.0616219202030865, + "learning_rate": 1.041321670926675e-05, + "loss": 0.4994, + "step": 6474 + }, + { + "epoch": 0.5, + "grad_norm": 1.1637901141078357, + "learning_rate": 1.0410706171252017e-05, + "loss": 0.5353, + "step": 6475 + }, + { + "epoch": 0.5, + "grad_norm": 1.2454489352011935, + "learning_rate": 1.0408195607307283e-05, + "loss": 0.559, + "step": 6476 + }, + { + "epoch": 0.5, + "grad_norm": 1.1409057376215426, + "learning_rate": 1.0405685017591057e-05, + "loss": 0.5341, + "step": 6477 + }, + { + "epoch": 0.5, + "grad_norm": 1.17630221462805, + "learning_rate": 1.040317440226184e-05, + "loss": 0.5392, + "step": 6478 + }, + { + "epoch": 0.5, + "grad_norm": 1.2302056409702453, + "learning_rate": 1.0400663761478145e-05, + "loss": 0.5698, + "step": 6479 + }, + { + "epoch": 0.5, + "grad_norm": 1.1188492587080476, + "learning_rate": 1.039815309539848e-05, + "loss": 0.5897, + "step": 6480 + }, + { + "epoch": 0.5, + "grad_norm": 1.1265768021139977, + "learning_rate": 1.0395642404181355e-05, + "loss": 0.5319, + "step": 6481 + }, + { + "epoch": 0.5, + "grad_norm": 1.2868430711621264, + "learning_rate": 1.0393131687985283e-05, + "loss": 0.5368, + "step": 6482 + }, + { + "epoch": 0.5, + "grad_norm": 1.2184903773075373, + "learning_rate": 1.039062094696878e-05, + "loss": 0.5443, + "step": 6483 + }, + { + "epoch": 0.5, + "grad_norm": 1.1159259101823455, + "learning_rate": 1.038811018129036e-05, + "loss": 0.4983, + "step": 6484 + }, + { + "epoch": 0.5, + "grad_norm": 1.1391438312562574, + "learning_rate": 1.0385599391108546e-05, + "loss": 0.5673, + "step": 6485 + }, + { + "epoch": 0.5, + "grad_norm": 1.1781463661583662, + "learning_rate": 1.0383088576581847e-05, + "loss": 0.5897, + "step": 6486 + }, + { + "epoch": 0.5, + "grad_norm": 1.1682227633546378, + "learning_rate": 1.0380577737868795e-05, + "loss": 0.5534, + "step": 6487 + }, + { + "epoch": 0.5, + "grad_norm": 1.0834568699067162, + "learning_rate": 1.0378066875127904e-05, + "loss": 0.5223, + "step": 6488 + }, + { + "epoch": 0.5, + "grad_norm": 1.211989997495161, + "learning_rate": 1.03755559885177e-05, + "loss": 0.5408, + "step": 6489 + }, + { + "epoch": 0.5, + "grad_norm": 1.2030624274293904, + "learning_rate": 1.0373045078196713e-05, + "loss": 0.5529, + "step": 6490 + }, + { + "epoch": 0.5, + "grad_norm": 1.1346852574474686, + "learning_rate": 1.037053414432346e-05, + "loss": 0.5373, + "step": 6491 + }, + { + "epoch": 0.5, + "grad_norm": 0.9813056504814902, + "learning_rate": 1.0368023187056477e-05, + "loss": 0.4685, + "step": 6492 + }, + { + "epoch": 0.5, + "grad_norm": 1.224564303552805, + "learning_rate": 1.0365512206554294e-05, + "loss": 0.5302, + "step": 6493 + }, + { + "epoch": 0.5, + "grad_norm": 1.2173315254374182, + "learning_rate": 1.0363001202975439e-05, + "loss": 0.5652, + "step": 6494 + }, + { + "epoch": 0.5, + "grad_norm": 1.1619547765142086, + "learning_rate": 1.0360490176478443e-05, + "loss": 0.5281, + "step": 6495 + }, + { + "epoch": 0.5, + "grad_norm": 1.3244909046216498, + "learning_rate": 1.0357979127221842e-05, + "loss": 0.618, + "step": 6496 + }, + { + "epoch": 0.5, + "grad_norm": 1.1800158214316623, + "learning_rate": 1.0355468055364171e-05, + "loss": 0.5515, + "step": 6497 + }, + { + "epoch": 0.5, + "grad_norm": 1.1716459940628152, + "learning_rate": 1.0352956961063972e-05, + "loss": 0.5988, + "step": 6498 + }, + { + "epoch": 0.5, + "grad_norm": 1.2110682755272661, + "learning_rate": 1.0350445844479775e-05, + "loss": 0.5478, + "step": 6499 + }, + { + "epoch": 0.5, + "grad_norm": 1.2359333100953198, + "learning_rate": 1.0347934705770126e-05, + "loss": 0.6416, + "step": 6500 + }, + { + "epoch": 0.5, + "grad_norm": 1.0897027474800642, + "learning_rate": 1.0345423545093563e-05, + "loss": 0.5335, + "step": 6501 + }, + { + "epoch": 0.5, + "grad_norm": 1.228188135984859, + "learning_rate": 1.0342912362608628e-05, + "loss": 0.5646, + "step": 6502 + }, + { + "epoch": 0.5, + "grad_norm": 1.122441667770039, + "learning_rate": 1.0340401158473869e-05, + "loss": 0.5241, + "step": 6503 + }, + { + "epoch": 0.5, + "grad_norm": 1.106340314120338, + "learning_rate": 1.0337889932847828e-05, + "loss": 0.5667, + "step": 6504 + }, + { + "epoch": 0.5, + "grad_norm": 1.1356259366054169, + "learning_rate": 1.0335378685889053e-05, + "loss": 0.5444, + "step": 6505 + }, + { + "epoch": 0.5, + "grad_norm": 1.2017887671328018, + "learning_rate": 1.033286741775609e-05, + "loss": 0.537, + "step": 6506 + }, + { + "epoch": 0.5, + "grad_norm": 1.2231736231102452, + "learning_rate": 1.0330356128607489e-05, + "loss": 0.5794, + "step": 6507 + }, + { + "epoch": 0.5, + "grad_norm": 1.1158043361957681, + "learning_rate": 1.0327844818601802e-05, + "loss": 0.5099, + "step": 6508 + }, + { + "epoch": 0.5, + "grad_norm": 1.1514681835706855, + "learning_rate": 1.0325333487897579e-05, + "loss": 0.5287, + "step": 6509 + }, + { + "epoch": 0.51, + "grad_norm": 1.2631800076050756, + "learning_rate": 1.0322822136653376e-05, + "loss": 0.5816, + "step": 6510 + }, + { + "epoch": 0.51, + "grad_norm": 1.2516436737444387, + "learning_rate": 1.0320310765027746e-05, + "loss": 0.5551, + "step": 6511 + }, + { + "epoch": 0.51, + "grad_norm": 1.0623838697808952, + "learning_rate": 1.0317799373179242e-05, + "loss": 0.5489, + "step": 6512 + }, + { + "epoch": 0.51, + "grad_norm": 1.1943054444856593, + "learning_rate": 1.0315287961266427e-05, + "loss": 0.6022, + "step": 6513 + }, + { + "epoch": 0.51, + "grad_norm": 1.1144732067332557, + "learning_rate": 1.0312776529447857e-05, + "loss": 0.5251, + "step": 6514 + }, + { + "epoch": 0.51, + "grad_norm": 1.0348766021144762, + "learning_rate": 1.031026507788209e-05, + "loss": 0.5296, + "step": 6515 + }, + { + "epoch": 0.51, + "grad_norm": 1.0781295333988135, + "learning_rate": 1.0307753606727685e-05, + "loss": 0.5198, + "step": 6516 + }, + { + "epoch": 0.51, + "grad_norm": 1.2003042669791053, + "learning_rate": 1.0305242116143209e-05, + "loss": 0.537, + "step": 6517 + }, + { + "epoch": 0.51, + "grad_norm": 1.1016393864630574, + "learning_rate": 1.0302730606287226e-05, + "loss": 0.5368, + "step": 6518 + }, + { + "epoch": 0.51, + "grad_norm": 1.153430594292088, + "learning_rate": 1.0300219077318294e-05, + "loss": 0.5304, + "step": 6519 + }, + { + "epoch": 0.51, + "grad_norm": 1.2458989580066322, + "learning_rate": 1.0297707529394984e-05, + "loss": 0.5492, + "step": 6520 + }, + { + "epoch": 0.51, + "grad_norm": 1.1302176913153803, + "learning_rate": 1.0295195962675864e-05, + "loss": 0.5189, + "step": 6521 + }, + { + "epoch": 0.51, + "grad_norm": 1.1454132639442505, + "learning_rate": 1.0292684377319495e-05, + "loss": 0.5302, + "step": 6522 + }, + { + "epoch": 0.51, + "grad_norm": 1.1407558940874796, + "learning_rate": 1.0290172773484455e-05, + "loss": 0.5018, + "step": 6523 + }, + { + "epoch": 0.51, + "grad_norm": 1.0986633567802637, + "learning_rate": 1.0287661151329312e-05, + "loss": 0.5305, + "step": 6524 + }, + { + "epoch": 0.51, + "grad_norm": 1.069648369561804, + "learning_rate": 1.0285149511012632e-05, + "loss": 0.5605, + "step": 6525 + }, + { + "epoch": 0.51, + "grad_norm": 1.2477564705169022, + "learning_rate": 1.0282637852692996e-05, + "loss": 0.5725, + "step": 6526 + }, + { + "epoch": 0.51, + "grad_norm": 1.2772713736931434, + "learning_rate": 1.028012617652897e-05, + "loss": 0.5525, + "step": 6527 + }, + { + "epoch": 0.51, + "grad_norm": 1.1024198509891407, + "learning_rate": 1.0277614482679136e-05, + "loss": 0.5512, + "step": 6528 + }, + { + "epoch": 0.51, + "grad_norm": 1.1801470436192658, + "learning_rate": 1.027510277130207e-05, + "loss": 0.5535, + "step": 6529 + }, + { + "epoch": 0.51, + "grad_norm": 1.1889222813063238, + "learning_rate": 1.027259104255634e-05, + "loss": 0.5709, + "step": 6530 + }, + { + "epoch": 0.51, + "grad_norm": 1.1782939334435116, + "learning_rate": 1.027007929660054e-05, + "loss": 0.5268, + "step": 6531 + }, + { + "epoch": 0.51, + "grad_norm": 1.1985508055758203, + "learning_rate": 1.0267567533593233e-05, + "loss": 0.5682, + "step": 6532 + }, + { + "epoch": 0.51, + "grad_norm": 1.0923754092476652, + "learning_rate": 1.026505575369301e-05, + "loss": 0.4851, + "step": 6533 + }, + { + "epoch": 0.51, + "grad_norm": 0.9899474862649464, + "learning_rate": 1.0262543957058451e-05, + "loss": 0.5371, + "step": 6534 + }, + { + "epoch": 0.51, + "grad_norm": 1.1415644917861016, + "learning_rate": 1.0260032143848134e-05, + "loss": 0.5193, + "step": 6535 + }, + { + "epoch": 0.51, + "grad_norm": 1.1370661358751857, + "learning_rate": 1.0257520314220647e-05, + "loss": 0.54, + "step": 6536 + }, + { + "epoch": 0.51, + "grad_norm": 1.0558569182361355, + "learning_rate": 1.0255008468334575e-05, + "loss": 0.5174, + "step": 6537 + }, + { + "epoch": 0.51, + "grad_norm": 1.1331114801945867, + "learning_rate": 1.0252496606348503e-05, + "loss": 0.5731, + "step": 6538 + }, + { + "epoch": 0.51, + "grad_norm": 1.216038009579673, + "learning_rate": 1.024998472842102e-05, + "loss": 0.5394, + "step": 6539 + }, + { + "epoch": 0.51, + "grad_norm": 1.0205418047194605, + "learning_rate": 1.0247472834710708e-05, + "loss": 0.5132, + "step": 6540 + }, + { + "epoch": 0.51, + "grad_norm": 1.1261061422619703, + "learning_rate": 1.024496092537616e-05, + "loss": 0.518, + "step": 6541 + }, + { + "epoch": 0.51, + "grad_norm": 1.2768046790209342, + "learning_rate": 1.0242449000575963e-05, + "loss": 0.6223, + "step": 6542 + }, + { + "epoch": 0.51, + "grad_norm": 1.143730638684103, + "learning_rate": 1.0239937060468713e-05, + "loss": 0.5265, + "step": 6543 + }, + { + "epoch": 0.51, + "grad_norm": 1.105533126196717, + "learning_rate": 1.0237425105212996e-05, + "loss": 0.5309, + "step": 6544 + }, + { + "epoch": 0.51, + "grad_norm": 1.1376822147515908, + "learning_rate": 1.0234913134967409e-05, + "loss": 0.5072, + "step": 6545 + }, + { + "epoch": 0.51, + "grad_norm": 1.3780283956771728, + "learning_rate": 1.0232401149890544e-05, + "loss": 0.6105, + "step": 6546 + }, + { + "epoch": 0.51, + "grad_norm": 1.1719564282419253, + "learning_rate": 1.0229889150140991e-05, + "loss": 0.5812, + "step": 6547 + }, + { + "epoch": 0.51, + "grad_norm": 1.1801045167024367, + "learning_rate": 1.0227377135877354e-05, + "loss": 0.6039, + "step": 6548 + }, + { + "epoch": 0.51, + "grad_norm": 1.3057707441253552, + "learning_rate": 1.0224865107258225e-05, + "loss": 0.5544, + "step": 6549 + }, + { + "epoch": 0.51, + "grad_norm": 1.206555615385461, + "learning_rate": 1.0222353064442201e-05, + "loss": 0.5313, + "step": 6550 + }, + { + "epoch": 0.51, + "grad_norm": 1.1341920016726996, + "learning_rate": 1.0219841007587881e-05, + "loss": 0.4894, + "step": 6551 + }, + { + "epoch": 0.51, + "grad_norm": 1.1782227574272128, + "learning_rate": 1.0217328936853864e-05, + "loss": 0.5468, + "step": 6552 + }, + { + "epoch": 0.51, + "grad_norm": 1.0071137245799828, + "learning_rate": 1.021481685239875e-05, + "loss": 0.5132, + "step": 6553 + }, + { + "epoch": 0.51, + "grad_norm": 1.2673552189287915, + "learning_rate": 1.0212304754381139e-05, + "loss": 0.5952, + "step": 6554 + }, + { + "epoch": 0.51, + "grad_norm": 1.3368441087632277, + "learning_rate": 1.0209792642959634e-05, + "loss": 0.5348, + "step": 6555 + }, + { + "epoch": 0.51, + "grad_norm": 1.0995009048652482, + "learning_rate": 1.0207280518292837e-05, + "loss": 0.5244, + "step": 6556 + }, + { + "epoch": 0.51, + "grad_norm": 1.1372245371197143, + "learning_rate": 1.0204768380539352e-05, + "loss": 0.5116, + "step": 6557 + }, + { + "epoch": 0.51, + "grad_norm": 1.155992015203863, + "learning_rate": 1.0202256229857781e-05, + "loss": 0.4794, + "step": 6558 + }, + { + "epoch": 0.51, + "grad_norm": 1.254772184371548, + "learning_rate": 1.0199744066406735e-05, + "loss": 0.5829, + "step": 6559 + }, + { + "epoch": 0.51, + "grad_norm": 1.09942245927465, + "learning_rate": 1.0197231890344809e-05, + "loss": 0.544, + "step": 6560 + }, + { + "epoch": 0.51, + "grad_norm": 1.2019369526575665, + "learning_rate": 1.0194719701830622e-05, + "loss": 0.5544, + "step": 6561 + }, + { + "epoch": 0.51, + "grad_norm": 1.126106724490322, + "learning_rate": 1.0192207501022771e-05, + "loss": 0.5929, + "step": 6562 + }, + { + "epoch": 0.51, + "grad_norm": 1.2013904899826602, + "learning_rate": 1.0189695288079873e-05, + "loss": 0.5481, + "step": 6563 + }, + { + "epoch": 0.51, + "grad_norm": 1.2206449693178236, + "learning_rate": 1.0187183063160531e-05, + "loss": 0.6079, + "step": 6564 + }, + { + "epoch": 0.51, + "grad_norm": 1.114337941664481, + "learning_rate": 1.0184670826423355e-05, + "loss": 0.5575, + "step": 6565 + }, + { + "epoch": 0.51, + "grad_norm": 1.2035699987463646, + "learning_rate": 1.018215857802696e-05, + "loss": 0.5743, + "step": 6566 + }, + { + "epoch": 0.51, + "grad_norm": 1.1147611185098123, + "learning_rate": 1.017964631812995e-05, + "loss": 0.552, + "step": 6567 + }, + { + "epoch": 0.51, + "grad_norm": 1.2153862726131532, + "learning_rate": 1.0177134046890944e-05, + "loss": 0.5884, + "step": 6568 + }, + { + "epoch": 0.51, + "grad_norm": 1.1977840626617977, + "learning_rate": 1.0174621764468553e-05, + "loss": 0.5469, + "step": 6569 + }, + { + "epoch": 0.51, + "grad_norm": 1.0818145204384482, + "learning_rate": 1.0172109471021385e-05, + "loss": 0.5288, + "step": 6570 + }, + { + "epoch": 0.51, + "grad_norm": 1.2089178655568527, + "learning_rate": 1.0169597166708061e-05, + "loss": 0.5069, + "step": 6571 + }, + { + "epoch": 0.51, + "grad_norm": 1.1449184348693413, + "learning_rate": 1.0167084851687193e-05, + "loss": 0.5405, + "step": 6572 + }, + { + "epoch": 0.51, + "grad_norm": 1.3064836484759563, + "learning_rate": 1.0164572526117396e-05, + "loss": 0.5443, + "step": 6573 + }, + { + "epoch": 0.51, + "grad_norm": 1.2010027014219982, + "learning_rate": 1.0162060190157285e-05, + "loss": 0.5643, + "step": 6574 + }, + { + "epoch": 0.51, + "grad_norm": 1.2232356054697529, + "learning_rate": 1.015954784396548e-05, + "loss": 0.5998, + "step": 6575 + }, + { + "epoch": 0.51, + "grad_norm": 1.2996876102859292, + "learning_rate": 1.0157035487700592e-05, + "loss": 0.6302, + "step": 6576 + }, + { + "epoch": 0.51, + "grad_norm": 1.1514674588739016, + "learning_rate": 1.0154523121521249e-05, + "loss": 0.5674, + "step": 6577 + }, + { + "epoch": 0.51, + "grad_norm": 1.0995095785330056, + "learning_rate": 1.0152010745586064e-05, + "loss": 0.5496, + "step": 6578 + }, + { + "epoch": 0.51, + "grad_norm": 1.1704674978457041, + "learning_rate": 1.0149498360053656e-05, + "loss": 0.5491, + "step": 6579 + }, + { + "epoch": 0.51, + "grad_norm": 1.2343202892685265, + "learning_rate": 1.014698596508264e-05, + "loss": 0.5643, + "step": 6580 + }, + { + "epoch": 0.51, + "grad_norm": 1.1821688393580443, + "learning_rate": 1.0144473560831645e-05, + "loss": 0.5698, + "step": 6581 + }, + { + "epoch": 0.51, + "grad_norm": 1.1871477407699609, + "learning_rate": 1.0141961147459289e-05, + "loss": 0.5708, + "step": 6582 + }, + { + "epoch": 0.51, + "grad_norm": 1.1251248184473497, + "learning_rate": 1.0139448725124194e-05, + "loss": 0.5394, + "step": 6583 + }, + { + "epoch": 0.51, + "grad_norm": 1.2159984044547656, + "learning_rate": 1.0136936293984983e-05, + "loss": 0.5143, + "step": 6584 + }, + { + "epoch": 0.51, + "grad_norm": 1.1610665394676354, + "learning_rate": 1.0134423854200274e-05, + "loss": 0.5475, + "step": 6585 + }, + { + "epoch": 0.51, + "grad_norm": 1.0504687103266939, + "learning_rate": 1.0131911405928694e-05, + "loss": 0.4593, + "step": 6586 + }, + { + "epoch": 0.51, + "grad_norm": 1.0737200880047597, + "learning_rate": 1.0129398949328868e-05, + "loss": 0.5123, + "step": 6587 + }, + { + "epoch": 0.51, + "grad_norm": 1.230050927325156, + "learning_rate": 1.0126886484559417e-05, + "loss": 0.5036, + "step": 6588 + }, + { + "epoch": 0.51, + "grad_norm": 1.101732011021924, + "learning_rate": 1.012437401177897e-05, + "loss": 0.5352, + "step": 6589 + }, + { + "epoch": 0.51, + "grad_norm": 1.1523324222331206, + "learning_rate": 1.0121861531146147e-05, + "loss": 0.5841, + "step": 6590 + }, + { + "epoch": 0.51, + "grad_norm": 1.2120752219755637, + "learning_rate": 1.0119349042819578e-05, + "loss": 0.5782, + "step": 6591 + }, + { + "epoch": 0.51, + "grad_norm": 1.2562836542888172, + "learning_rate": 1.0116836546957891e-05, + "loss": 0.5675, + "step": 6592 + }, + { + "epoch": 0.51, + "grad_norm": 1.190674203571873, + "learning_rate": 1.0114324043719705e-05, + "loss": 0.5856, + "step": 6593 + }, + { + "epoch": 0.51, + "grad_norm": 1.1906225910312582, + "learning_rate": 1.0111811533263656e-05, + "loss": 0.5644, + "step": 6594 + }, + { + "epoch": 0.51, + "grad_norm": 1.2795944217278874, + "learning_rate": 1.0109299015748364e-05, + "loss": 0.5661, + "step": 6595 + }, + { + "epoch": 0.51, + "grad_norm": 1.3449736944980009, + "learning_rate": 1.0106786491332464e-05, + "loss": 0.5914, + "step": 6596 + }, + { + "epoch": 0.51, + "grad_norm": 1.2152631225913308, + "learning_rate": 1.0104273960174584e-05, + "loss": 0.5965, + "step": 6597 + }, + { + "epoch": 0.51, + "grad_norm": 1.1265948964194838, + "learning_rate": 1.0101761422433348e-05, + "loss": 0.5187, + "step": 6598 + }, + { + "epoch": 0.51, + "grad_norm": 1.2651839253000516, + "learning_rate": 1.0099248878267387e-05, + "loss": 0.5677, + "step": 6599 + }, + { + "epoch": 0.51, + "grad_norm": 1.195605971269191, + "learning_rate": 1.0096736327835335e-05, + "loss": 0.5369, + "step": 6600 + }, + { + "epoch": 0.51, + "grad_norm": 1.1175418105187145, + "learning_rate": 1.0094223771295817e-05, + "loss": 0.5727, + "step": 6601 + }, + { + "epoch": 0.51, + "grad_norm": 1.2082615973565014, + "learning_rate": 1.0091711208807471e-05, + "loss": 0.5648, + "step": 6602 + }, + { + "epoch": 0.51, + "grad_norm": 1.1560729638736833, + "learning_rate": 1.008919864052892e-05, + "loss": 0.5104, + "step": 6603 + }, + { + "epoch": 0.51, + "grad_norm": 1.1925774992545393, + "learning_rate": 1.0086686066618795e-05, + "loss": 0.5291, + "step": 6604 + }, + { + "epoch": 0.51, + "grad_norm": 1.1949278268606622, + "learning_rate": 1.0084173487235737e-05, + "loss": 0.6123, + "step": 6605 + }, + { + "epoch": 0.51, + "grad_norm": 1.095876588197386, + "learning_rate": 1.008166090253837e-05, + "loss": 0.5442, + "step": 6606 + }, + { + "epoch": 0.51, + "grad_norm": 1.2621101268410695, + "learning_rate": 1.0079148312685334e-05, + "loss": 0.5848, + "step": 6607 + }, + { + "epoch": 0.51, + "grad_norm": 1.172862997007255, + "learning_rate": 1.007663571783525e-05, + "loss": 0.5713, + "step": 6608 + }, + { + "epoch": 0.51, + "grad_norm": 1.1865492076194548, + "learning_rate": 1.007412311814676e-05, + "loss": 0.5255, + "step": 6609 + }, + { + "epoch": 0.51, + "grad_norm": 1.195853815113784, + "learning_rate": 1.0071610513778494e-05, + "loss": 0.5771, + "step": 6610 + }, + { + "epoch": 0.51, + "grad_norm": 1.0966509359230607, + "learning_rate": 1.0069097904889087e-05, + "loss": 0.5368, + "step": 6611 + }, + { + "epoch": 0.51, + "grad_norm": 1.1946093916049472, + "learning_rate": 1.0066585291637176e-05, + "loss": 0.6046, + "step": 6612 + }, + { + "epoch": 0.51, + "grad_norm": 1.1394684558735477, + "learning_rate": 1.006407267418139e-05, + "loss": 0.5404, + "step": 6613 + }, + { + "epoch": 0.51, + "grad_norm": 1.1732227967134585, + "learning_rate": 1.0061560052680363e-05, + "loss": 0.5562, + "step": 6614 + }, + { + "epoch": 0.51, + "grad_norm": 1.1894899559895769, + "learning_rate": 1.0059047427292736e-05, + "loss": 0.5676, + "step": 6615 + }, + { + "epoch": 0.51, + "grad_norm": 1.2033830836535635, + "learning_rate": 1.0056534798177138e-05, + "loss": 0.5798, + "step": 6616 + }, + { + "epoch": 0.51, + "grad_norm": 1.2348429118958113, + "learning_rate": 1.005402216549221e-05, + "loss": 0.5515, + "step": 6617 + }, + { + "epoch": 0.51, + "grad_norm": 1.103960173869838, + "learning_rate": 1.005150952939658e-05, + "loss": 0.5235, + "step": 6618 + }, + { + "epoch": 0.51, + "grad_norm": 1.1810135619881144, + "learning_rate": 1.0048996890048886e-05, + "loss": 0.527, + "step": 6619 + }, + { + "epoch": 0.51, + "grad_norm": 1.0970790336361398, + "learning_rate": 1.004648424760777e-05, + "loss": 0.4899, + "step": 6620 + }, + { + "epoch": 0.51, + "grad_norm": 1.1840508211092986, + "learning_rate": 1.0043971602231862e-05, + "loss": 0.5785, + "step": 6621 + }, + { + "epoch": 0.51, + "grad_norm": 1.1611851198367176, + "learning_rate": 1.0041458954079801e-05, + "loss": 0.5741, + "step": 6622 + }, + { + "epoch": 0.51, + "grad_norm": 1.105816358499722, + "learning_rate": 1.003894630331022e-05, + "loss": 0.5021, + "step": 6623 + }, + { + "epoch": 0.51, + "grad_norm": 1.1876195546250112, + "learning_rate": 1.0036433650081759e-05, + "loss": 0.518, + "step": 6624 + }, + { + "epoch": 0.51, + "grad_norm": 1.235508699939931, + "learning_rate": 1.0033920994553054e-05, + "loss": 0.5556, + "step": 6625 + }, + { + "epoch": 0.51, + "grad_norm": 1.1087737267330544, + "learning_rate": 1.003140833688274e-05, + "loss": 0.4528, + "step": 6626 + }, + { + "epoch": 0.51, + "grad_norm": 1.1096194897809544, + "learning_rate": 1.0028895677229458e-05, + "loss": 0.5449, + "step": 6627 + }, + { + "epoch": 0.51, + "grad_norm": 1.111613634170427, + "learning_rate": 1.0026383015751844e-05, + "loss": 0.5611, + "step": 6628 + }, + { + "epoch": 0.51, + "grad_norm": 1.209598071203132, + "learning_rate": 1.0023870352608529e-05, + "loss": 0.5912, + "step": 6629 + }, + { + "epoch": 0.51, + "grad_norm": 1.1306951268485905, + "learning_rate": 1.002135768795816e-05, + "loss": 0.4876, + "step": 6630 + }, + { + "epoch": 0.51, + "grad_norm": 1.1687027273970612, + "learning_rate": 1.0018845021959368e-05, + "loss": 0.5534, + "step": 6631 + }, + { + "epoch": 0.51, + "grad_norm": 1.1685012573085252, + "learning_rate": 1.0016332354770792e-05, + "loss": 0.5429, + "step": 6632 + }, + { + "epoch": 0.51, + "grad_norm": 1.1147397843598823, + "learning_rate": 1.0013819686551074e-05, + "loss": 0.5024, + "step": 6633 + }, + { + "epoch": 0.51, + "grad_norm": 1.1629865141043234, + "learning_rate": 1.0011307017458843e-05, + "loss": 0.5447, + "step": 6634 + }, + { + "epoch": 0.51, + "grad_norm": 1.1114728193164345, + "learning_rate": 1.0008794347652744e-05, + "loss": 0.5397, + "step": 6635 + }, + { + "epoch": 0.51, + "grad_norm": 1.0910426282972538, + "learning_rate": 1.0006281677291411e-05, + "loss": 0.5077, + "step": 6636 + }, + { + "epoch": 0.51, + "grad_norm": 1.199703106312897, + "learning_rate": 1.0003769006533486e-05, + "loss": 0.5656, + "step": 6637 + }, + { + "epoch": 0.51, + "grad_norm": 1.1136969074819794, + "learning_rate": 1.0001256335537604e-05, + "loss": 0.5386, + "step": 6638 + }, + { + "epoch": 0.52, + "grad_norm": 1.123661887901538, + "learning_rate": 9.998743664462401e-06, + "loss": 0.5245, + "step": 6639 + }, + { + "epoch": 0.52, + "grad_norm": 1.1360052956941233, + "learning_rate": 9.996230993466517e-06, + "loss": 0.5185, + "step": 6640 + }, + { + "epoch": 0.52, + "grad_norm": 1.1896329095578912, + "learning_rate": 9.993718322708592e-06, + "loss": 0.5385, + "step": 6641 + }, + { + "epoch": 0.52, + "grad_norm": 1.1937608808251252, + "learning_rate": 9.99120565234726e-06, + "loss": 0.5775, + "step": 6642 + }, + { + "epoch": 0.52, + "grad_norm": 1.1417846532866414, + "learning_rate": 9.988692982541159e-06, + "loss": 0.5625, + "step": 6643 + }, + { + "epoch": 0.52, + "grad_norm": 1.2271778482631197, + "learning_rate": 9.986180313448933e-06, + "loss": 0.558, + "step": 6644 + }, + { + "epoch": 0.52, + "grad_norm": 1.1966580067235342, + "learning_rate": 9.98366764522921e-06, + "loss": 0.5977, + "step": 6645 + }, + { + "epoch": 0.52, + "grad_norm": 1.1479942867854638, + "learning_rate": 9.981154978040636e-06, + "loss": 0.5413, + "step": 6646 + }, + { + "epoch": 0.52, + "grad_norm": 1.1383323621303572, + "learning_rate": 9.978642312041843e-06, + "loss": 0.6055, + "step": 6647 + }, + { + "epoch": 0.52, + "grad_norm": 1.176632899069099, + "learning_rate": 9.976129647391471e-06, + "loss": 0.5126, + "step": 6648 + }, + { + "epoch": 0.52, + "grad_norm": 1.1551517219305074, + "learning_rate": 9.973616984248162e-06, + "loss": 0.5925, + "step": 6649 + }, + { + "epoch": 0.52, + "grad_norm": 1.2088785202200736, + "learning_rate": 9.971104322770545e-06, + "loss": 0.6194, + "step": 6650 + }, + { + "epoch": 0.52, + "grad_norm": 1.1984971947953453, + "learning_rate": 9.968591663117263e-06, + "loss": 0.485, + "step": 6651 + }, + { + "epoch": 0.52, + "grad_norm": 1.1998027659293227, + "learning_rate": 9.966079005446949e-06, + "loss": 0.5658, + "step": 6652 + }, + { + "epoch": 0.52, + "grad_norm": 1.2112554317388005, + "learning_rate": 9.963566349918243e-06, + "loss": 0.5284, + "step": 6653 + }, + { + "epoch": 0.52, + "grad_norm": 1.0795774213265117, + "learning_rate": 9.961053696689785e-06, + "loss": 0.464, + "step": 6654 + }, + { + "epoch": 0.52, + "grad_norm": 1.3131545569076983, + "learning_rate": 9.958541045920204e-06, + "loss": 0.5983, + "step": 6655 + }, + { + "epoch": 0.52, + "grad_norm": 1.0439433141534018, + "learning_rate": 9.956028397768143e-06, + "loss": 0.5037, + "step": 6656 + }, + { + "epoch": 0.52, + "grad_norm": 1.0522319323655531, + "learning_rate": 9.953515752392233e-06, + "loss": 0.5325, + "step": 6657 + }, + { + "epoch": 0.52, + "grad_norm": 1.0810909110102431, + "learning_rate": 9.951003109951114e-06, + "loss": 0.5279, + "step": 6658 + }, + { + "epoch": 0.52, + "grad_norm": 1.109893784982816, + "learning_rate": 9.948490470603425e-06, + "loss": 0.5258, + "step": 6659 + }, + { + "epoch": 0.52, + "grad_norm": 1.1533490467287377, + "learning_rate": 9.945977834507796e-06, + "loss": 0.5382, + "step": 6660 + }, + { + "epoch": 0.52, + "grad_norm": 1.0864777353272719, + "learning_rate": 9.943465201822865e-06, + "loss": 0.5156, + "step": 6661 + }, + { + "epoch": 0.52, + "grad_norm": 1.116589899887985, + "learning_rate": 9.940952572707267e-06, + "loss": 0.5135, + "step": 6662 + }, + { + "epoch": 0.52, + "grad_norm": 1.1224579701700543, + "learning_rate": 9.938439947319639e-06, + "loss": 0.5442, + "step": 6663 + }, + { + "epoch": 0.52, + "grad_norm": 1.2587810601926352, + "learning_rate": 9.935927325818616e-06, + "loss": 0.5419, + "step": 6664 + }, + { + "epoch": 0.52, + "grad_norm": 1.0638526832876256, + "learning_rate": 9.933414708362829e-06, + "loss": 0.548, + "step": 6665 + }, + { + "epoch": 0.52, + "grad_norm": 1.3376120172812929, + "learning_rate": 9.930902095110916e-06, + "loss": 0.5877, + "step": 6666 + }, + { + "epoch": 0.52, + "grad_norm": 1.0784191891208117, + "learning_rate": 9.928389486221507e-06, + "loss": 0.49, + "step": 6667 + }, + { + "epoch": 0.52, + "grad_norm": 1.232844695817907, + "learning_rate": 9.925876881853242e-06, + "loss": 0.5598, + "step": 6668 + }, + { + "epoch": 0.52, + "grad_norm": 1.2627350106898854, + "learning_rate": 9.92336428216475e-06, + "loss": 0.6332, + "step": 6669 + }, + { + "epoch": 0.52, + "grad_norm": 1.175360670940941, + "learning_rate": 9.920851687314673e-06, + "loss": 0.5429, + "step": 6670 + }, + { + "epoch": 0.52, + "grad_norm": 1.2129608477966873, + "learning_rate": 9.918339097461631e-06, + "loss": 0.5597, + "step": 6671 + }, + { + "epoch": 0.52, + "grad_norm": 1.2944959397412539, + "learning_rate": 9.915826512764265e-06, + "loss": 0.5731, + "step": 6672 + }, + { + "epoch": 0.52, + "grad_norm": 1.1980466799740528, + "learning_rate": 9.913313933381203e-06, + "loss": 0.5604, + "step": 6673 + }, + { + "epoch": 0.52, + "grad_norm": 1.1586571972503563, + "learning_rate": 9.91080135947108e-06, + "loss": 0.5591, + "step": 6674 + }, + { + "epoch": 0.52, + "grad_norm": 1.2079291106038474, + "learning_rate": 9.908288791192532e-06, + "loss": 0.6064, + "step": 6675 + }, + { + "epoch": 0.52, + "grad_norm": 1.1589997046587932, + "learning_rate": 9.905776228704185e-06, + "loss": 0.5198, + "step": 6676 + }, + { + "epoch": 0.52, + "grad_norm": 1.0285197442273248, + "learning_rate": 9.903263672164668e-06, + "loss": 0.5242, + "step": 6677 + }, + { + "epoch": 0.52, + "grad_norm": 1.135374657556637, + "learning_rate": 9.900751121732613e-06, + "loss": 0.5524, + "step": 6678 + }, + { + "epoch": 0.52, + "grad_norm": 1.1864209040542244, + "learning_rate": 9.898238577566654e-06, + "loss": 0.5433, + "step": 6679 + }, + { + "epoch": 0.52, + "grad_norm": 1.193954943495299, + "learning_rate": 9.895726039825421e-06, + "loss": 0.5374, + "step": 6680 + }, + { + "epoch": 0.52, + "grad_norm": 1.1034898603014098, + "learning_rate": 9.89321350866754e-06, + "loss": 0.5038, + "step": 6681 + }, + { + "epoch": 0.52, + "grad_norm": 1.1904055035131913, + "learning_rate": 9.890700984251638e-06, + "loss": 0.5619, + "step": 6682 + }, + { + "epoch": 0.52, + "grad_norm": 1.0613637907671543, + "learning_rate": 9.888188466736347e-06, + "loss": 0.5205, + "step": 6683 + }, + { + "epoch": 0.52, + "grad_norm": 1.1402003921928328, + "learning_rate": 9.885675956280295e-06, + "loss": 0.5368, + "step": 6684 + }, + { + "epoch": 0.52, + "grad_norm": 1.2505179762526284, + "learning_rate": 9.883163453042115e-06, + "loss": 0.6086, + "step": 6685 + }, + { + "epoch": 0.52, + "grad_norm": 1.2176526827315106, + "learning_rate": 9.880650957180427e-06, + "loss": 0.5601, + "step": 6686 + }, + { + "epoch": 0.52, + "grad_norm": 1.1829621798547385, + "learning_rate": 9.878138468853856e-06, + "loss": 0.5446, + "step": 6687 + }, + { + "epoch": 0.52, + "grad_norm": 1.1524636933572054, + "learning_rate": 9.875625988221033e-06, + "loss": 0.5235, + "step": 6688 + }, + { + "epoch": 0.52, + "grad_norm": 1.104295249265777, + "learning_rate": 9.873113515440583e-06, + "loss": 0.542, + "step": 6689 + }, + { + "epoch": 0.52, + "grad_norm": 1.1996496960823801, + "learning_rate": 9.870601050671137e-06, + "loss": 0.5884, + "step": 6690 + }, + { + "epoch": 0.52, + "grad_norm": 1.158612852652886, + "learning_rate": 9.86808859407131e-06, + "loss": 0.5379, + "step": 6691 + }, + { + "epoch": 0.52, + "grad_norm": 1.2031040437532907, + "learning_rate": 9.865576145799729e-06, + "loss": 0.5425, + "step": 6692 + }, + { + "epoch": 0.52, + "grad_norm": 1.1316717784150023, + "learning_rate": 9.86306370601502e-06, + "loss": 0.517, + "step": 6693 + }, + { + "epoch": 0.52, + "grad_norm": 1.325490470293655, + "learning_rate": 9.860551274875806e-06, + "loss": 0.5813, + "step": 6694 + }, + { + "epoch": 0.52, + "grad_norm": 1.2344710880013376, + "learning_rate": 9.858038852540713e-06, + "loss": 0.5207, + "step": 6695 + }, + { + "epoch": 0.52, + "grad_norm": 1.1529321752253516, + "learning_rate": 9.855526439168359e-06, + "loss": 0.5231, + "step": 6696 + }, + { + "epoch": 0.52, + "grad_norm": 1.1133799726649902, + "learning_rate": 9.853014034917361e-06, + "loss": 0.5354, + "step": 6697 + }, + { + "epoch": 0.52, + "grad_norm": 1.2422150422887546, + "learning_rate": 9.850501639946349e-06, + "loss": 0.5913, + "step": 6698 + }, + { + "epoch": 0.52, + "grad_norm": 1.2543223038397229, + "learning_rate": 9.847989254413938e-06, + "loss": 0.565, + "step": 6699 + }, + { + "epoch": 0.52, + "grad_norm": 1.172135947102419, + "learning_rate": 9.845476878478754e-06, + "loss": 0.5242, + "step": 6700 + }, + { + "epoch": 0.52, + "grad_norm": 1.3321386389079346, + "learning_rate": 9.84296451229941e-06, + "loss": 0.608, + "step": 6701 + }, + { + "epoch": 0.52, + "grad_norm": 1.127404557225205, + "learning_rate": 9.840452156034523e-06, + "loss": 0.5198, + "step": 6702 + }, + { + "epoch": 0.52, + "grad_norm": 1.128997799022276, + "learning_rate": 9.837939809842717e-06, + "loss": 0.4907, + "step": 6703 + }, + { + "epoch": 0.52, + "grad_norm": 1.0543452519967167, + "learning_rate": 9.835427473882606e-06, + "loss": 0.5482, + "step": 6704 + }, + { + "epoch": 0.52, + "grad_norm": 1.1739258685808915, + "learning_rate": 9.83291514831281e-06, + "loss": 0.5725, + "step": 6705 + }, + { + "epoch": 0.52, + "grad_norm": 1.18268084177563, + "learning_rate": 9.830402833291942e-06, + "loss": 0.5413, + "step": 6706 + }, + { + "epoch": 0.52, + "grad_norm": 1.1920730975895073, + "learning_rate": 9.827890528978617e-06, + "loss": 0.574, + "step": 6707 + }, + { + "epoch": 0.52, + "grad_norm": 1.1162062917697777, + "learning_rate": 9.82537823553145e-06, + "loss": 0.541, + "step": 6708 + }, + { + "epoch": 0.52, + "grad_norm": 1.2010820060543905, + "learning_rate": 9.822865953109055e-06, + "loss": 0.5556, + "step": 6709 + }, + { + "epoch": 0.52, + "grad_norm": 1.1876201568848452, + "learning_rate": 9.820353681870052e-06, + "loss": 0.5575, + "step": 6710 + }, + { + "epoch": 0.52, + "grad_norm": 1.1585664999310887, + "learning_rate": 9.817841421973046e-06, + "loss": 0.534, + "step": 6711 + }, + { + "epoch": 0.52, + "grad_norm": 1.1678580037051753, + "learning_rate": 9.815329173576648e-06, + "loss": 0.5664, + "step": 6712 + }, + { + "epoch": 0.52, + "grad_norm": 1.1033214296945408, + "learning_rate": 9.812816936839472e-06, + "loss": 0.528, + "step": 6713 + }, + { + "epoch": 0.52, + "grad_norm": 1.1152892090900546, + "learning_rate": 9.810304711920127e-06, + "loss": 0.5323, + "step": 6714 + }, + { + "epoch": 0.52, + "grad_norm": 1.1848273065334767, + "learning_rate": 9.80779249897723e-06, + "loss": 0.5652, + "step": 6715 + }, + { + "epoch": 0.52, + "grad_norm": 1.1234445415164733, + "learning_rate": 9.805280298169383e-06, + "loss": 0.562, + "step": 6716 + }, + { + "epoch": 0.52, + "grad_norm": 1.0949469964598548, + "learning_rate": 9.802768109655192e-06, + "loss": 0.5287, + "step": 6717 + }, + { + "epoch": 0.52, + "grad_norm": 1.250375881900646, + "learning_rate": 9.800255933593269e-06, + "loss": 0.5525, + "step": 6718 + }, + { + "epoch": 0.52, + "grad_norm": 1.0301054758860864, + "learning_rate": 9.797743770142219e-06, + "loss": 0.5037, + "step": 6719 + }, + { + "epoch": 0.52, + "grad_norm": 1.1472910653846946, + "learning_rate": 9.795231619460652e-06, + "loss": 0.5301, + "step": 6720 + }, + { + "epoch": 0.52, + "grad_norm": 1.0766018598694587, + "learning_rate": 9.792719481707168e-06, + "loss": 0.4882, + "step": 6721 + }, + { + "epoch": 0.52, + "grad_norm": 1.1165987077122363, + "learning_rate": 9.79020735704037e-06, + "loss": 0.506, + "step": 6722 + }, + { + "epoch": 0.52, + "grad_norm": 1.0982313176959915, + "learning_rate": 9.787695245618864e-06, + "loss": 0.5402, + "step": 6723 + }, + { + "epoch": 0.52, + "grad_norm": 1.104722974496296, + "learning_rate": 9.785183147601252e-06, + "loss": 0.534, + "step": 6724 + }, + { + "epoch": 0.52, + "grad_norm": 1.1942002850255082, + "learning_rate": 9.782671063146138e-06, + "loss": 0.5297, + "step": 6725 + }, + { + "epoch": 0.52, + "grad_norm": 1.2304632762000023, + "learning_rate": 9.780158992412124e-06, + "loss": 0.5548, + "step": 6726 + }, + { + "epoch": 0.52, + "grad_norm": 1.038458110594458, + "learning_rate": 9.777646935557802e-06, + "loss": 0.5134, + "step": 6727 + }, + { + "epoch": 0.52, + "grad_norm": 1.2464058225166397, + "learning_rate": 9.775134892741778e-06, + "loss": 0.5645, + "step": 6728 + }, + { + "epoch": 0.52, + "grad_norm": 1.1757873332224278, + "learning_rate": 9.772622864122645e-06, + "loss": 0.5481, + "step": 6729 + }, + { + "epoch": 0.52, + "grad_norm": 1.0662611977994314, + "learning_rate": 9.770110849859009e-06, + "loss": 0.5524, + "step": 6730 + }, + { + "epoch": 0.52, + "grad_norm": 1.1548211832051936, + "learning_rate": 9.767598850109463e-06, + "loss": 0.5567, + "step": 6731 + }, + { + "epoch": 0.52, + "grad_norm": 1.1617516231439333, + "learning_rate": 9.765086865032596e-06, + "loss": 0.4977, + "step": 6732 + }, + { + "epoch": 0.52, + "grad_norm": 1.3047199359448107, + "learning_rate": 9.762574894787006e-06, + "loss": 0.5427, + "step": 6733 + }, + { + "epoch": 0.52, + "grad_norm": 1.203732126802435, + "learning_rate": 9.760062939531289e-06, + "loss": 0.5459, + "step": 6734 + }, + { + "epoch": 0.52, + "grad_norm": 1.1284393084038036, + "learning_rate": 9.757550999424038e-06, + "loss": 0.5466, + "step": 6735 + }, + { + "epoch": 0.52, + "grad_norm": 1.1869552266193888, + "learning_rate": 9.755039074623846e-06, + "loss": 0.5742, + "step": 6736 + }, + { + "epoch": 0.52, + "grad_norm": 1.632009865642161, + "learning_rate": 9.752527165289297e-06, + "loss": 0.6006, + "step": 6737 + }, + { + "epoch": 0.52, + "grad_norm": 1.2485300480958519, + "learning_rate": 9.750015271578982e-06, + "loss": 0.5613, + "step": 6738 + }, + { + "epoch": 0.52, + "grad_norm": 1.1568116035587794, + "learning_rate": 9.747503393651499e-06, + "loss": 0.5488, + "step": 6739 + }, + { + "epoch": 0.52, + "grad_norm": 1.0118085313234109, + "learning_rate": 9.744991531665425e-06, + "loss": 0.5363, + "step": 6740 + }, + { + "epoch": 0.52, + "grad_norm": 1.1239115960152977, + "learning_rate": 9.742479685779356e-06, + "loss": 0.5595, + "step": 6741 + }, + { + "epoch": 0.52, + "grad_norm": 1.262332306479604, + "learning_rate": 9.739967856151868e-06, + "loss": 0.5743, + "step": 6742 + }, + { + "epoch": 0.52, + "grad_norm": 1.251172660091367, + "learning_rate": 9.737456042941552e-06, + "loss": 0.5425, + "step": 6743 + }, + { + "epoch": 0.52, + "grad_norm": 1.156354538599304, + "learning_rate": 9.734944246306994e-06, + "loss": 0.4942, + "step": 6744 + }, + { + "epoch": 0.52, + "grad_norm": 1.2061475463208424, + "learning_rate": 9.732432466406769e-06, + "loss": 0.5433, + "step": 6745 + }, + { + "epoch": 0.52, + "grad_norm": 1.1490041152442163, + "learning_rate": 9.729920703399468e-06, + "loss": 0.5294, + "step": 6746 + }, + { + "epoch": 0.52, + "grad_norm": 1.170693730395212, + "learning_rate": 9.727408957443661e-06, + "loss": 0.5873, + "step": 6747 + }, + { + "epoch": 0.52, + "grad_norm": 1.1811728316032652, + "learning_rate": 9.724897228697933e-06, + "loss": 0.5533, + "step": 6748 + }, + { + "epoch": 0.52, + "grad_norm": 1.2222789633228246, + "learning_rate": 9.722385517320866e-06, + "loss": 0.5175, + "step": 6749 + }, + { + "epoch": 0.52, + "grad_norm": 1.0903614960997197, + "learning_rate": 9.71987382347103e-06, + "loss": 0.5097, + "step": 6750 + }, + { + "epoch": 0.52, + "grad_norm": 1.103902455274103, + "learning_rate": 9.717362147307009e-06, + "loss": 0.5288, + "step": 6751 + }, + { + "epoch": 0.52, + "grad_norm": 1.1689437827958349, + "learning_rate": 9.714850488987371e-06, + "loss": 0.531, + "step": 6752 + }, + { + "epoch": 0.52, + "grad_norm": 1.1734801425627166, + "learning_rate": 9.712338848670691e-06, + "loss": 0.5627, + "step": 6753 + }, + { + "epoch": 0.52, + "grad_norm": 1.108512381860031, + "learning_rate": 9.709827226515547e-06, + "loss": 0.4915, + "step": 6754 + }, + { + "epoch": 0.52, + "grad_norm": 1.0919641356395131, + "learning_rate": 9.707315622680505e-06, + "loss": 0.4829, + "step": 6755 + }, + { + "epoch": 0.52, + "grad_norm": 1.157064923936836, + "learning_rate": 9.70480403732414e-06, + "loss": 0.5811, + "step": 6756 + }, + { + "epoch": 0.52, + "grad_norm": 1.182154419224635, + "learning_rate": 9.702292470605017e-06, + "loss": 0.562, + "step": 6757 + }, + { + "epoch": 0.52, + "grad_norm": 1.1521575260014056, + "learning_rate": 9.699780922681707e-06, + "loss": 0.6059, + "step": 6758 + }, + { + "epoch": 0.52, + "grad_norm": 1.1294052446766776, + "learning_rate": 9.697269393712779e-06, + "loss": 0.5141, + "step": 6759 + }, + { + "epoch": 0.52, + "grad_norm": 1.1084767318162467, + "learning_rate": 9.694757883856791e-06, + "loss": 0.4807, + "step": 6760 + }, + { + "epoch": 0.52, + "grad_norm": 1.081503012717782, + "learning_rate": 9.69224639327232e-06, + "loss": 0.5312, + "step": 6761 + }, + { + "epoch": 0.52, + "grad_norm": 1.1142215973621492, + "learning_rate": 9.689734922117915e-06, + "loss": 0.4868, + "step": 6762 + }, + { + "epoch": 0.52, + "grad_norm": 1.1839190750464534, + "learning_rate": 9.687223470552146e-06, + "loss": 0.545, + "step": 6763 + }, + { + "epoch": 0.52, + "grad_norm": 1.2167807588688702, + "learning_rate": 9.684712038733575e-06, + "loss": 0.547, + "step": 6764 + }, + { + "epoch": 0.52, + "grad_norm": 1.1254207036382493, + "learning_rate": 9.682200626820758e-06, + "loss": 0.5596, + "step": 6765 + }, + { + "epoch": 0.52, + "grad_norm": 1.1276042253994945, + "learning_rate": 9.679689234972259e-06, + "loss": 0.5294, + "step": 6766 + }, + { + "epoch": 0.52, + "grad_norm": 1.1455863802113557, + "learning_rate": 9.677177863346627e-06, + "loss": 0.5391, + "step": 6767 + }, + { + "epoch": 0.53, + "grad_norm": 1.1438606041965453, + "learning_rate": 9.674666512102423e-06, + "loss": 0.4781, + "step": 6768 + }, + { + "epoch": 0.53, + "grad_norm": 1.190637609460096, + "learning_rate": 9.672155181398201e-06, + "loss": 0.5484, + "step": 6769 + }, + { + "epoch": 0.53, + "grad_norm": 1.0915448438700075, + "learning_rate": 9.669643871392513e-06, + "loss": 0.4701, + "step": 6770 + }, + { + "epoch": 0.53, + "grad_norm": 1.32897414942817, + "learning_rate": 9.667132582243916e-06, + "loss": 0.5755, + "step": 6771 + }, + { + "epoch": 0.53, + "grad_norm": 1.2611725277353434, + "learning_rate": 9.664621314110952e-06, + "loss": 0.5244, + "step": 6772 + }, + { + "epoch": 0.53, + "grad_norm": 1.109609498505618, + "learning_rate": 9.662110067152173e-06, + "loss": 0.4966, + "step": 6773 + }, + { + "epoch": 0.53, + "grad_norm": 1.1127352797923649, + "learning_rate": 9.659598841526133e-06, + "loss": 0.5074, + "step": 6774 + }, + { + "epoch": 0.53, + "grad_norm": 1.0763404561840126, + "learning_rate": 9.657087637391372e-06, + "loss": 0.4972, + "step": 6775 + }, + { + "epoch": 0.53, + "grad_norm": 1.1935854134266628, + "learning_rate": 9.654576454906437e-06, + "loss": 0.5213, + "step": 6776 + }, + { + "epoch": 0.53, + "grad_norm": 1.2369850667177, + "learning_rate": 9.652065294229877e-06, + "loss": 0.5326, + "step": 6777 + }, + { + "epoch": 0.53, + "grad_norm": 1.1852745987826399, + "learning_rate": 9.649554155520227e-06, + "loss": 0.5173, + "step": 6778 + }, + { + "epoch": 0.53, + "grad_norm": 1.0512529753689384, + "learning_rate": 9.647043038936033e-06, + "loss": 0.5297, + "step": 6779 + }, + { + "epoch": 0.53, + "grad_norm": 1.1237311625360409, + "learning_rate": 9.644531944635829e-06, + "loss": 0.5414, + "step": 6780 + }, + { + "epoch": 0.53, + "grad_norm": 1.3321954172942962, + "learning_rate": 9.64202087277816e-06, + "loss": 0.604, + "step": 6781 + }, + { + "epoch": 0.53, + "grad_norm": 1.098598361004965, + "learning_rate": 9.639509823521562e-06, + "loss": 0.5335, + "step": 6782 + }, + { + "epoch": 0.53, + "grad_norm": 1.2298731740059234, + "learning_rate": 9.636998797024566e-06, + "loss": 0.5467, + "step": 6783 + }, + { + "epoch": 0.53, + "grad_norm": 1.092405582861112, + "learning_rate": 9.634487793445711e-06, + "loss": 0.552, + "step": 6784 + }, + { + "epoch": 0.53, + "grad_norm": 1.0590108352445429, + "learning_rate": 9.631976812943523e-06, + "loss": 0.468, + "step": 6785 + }, + { + "epoch": 0.53, + "grad_norm": 1.1622103607153123, + "learning_rate": 9.62946585567654e-06, + "loss": 0.5303, + "step": 6786 + }, + { + "epoch": 0.53, + "grad_norm": 1.2147668648581655, + "learning_rate": 9.626954921803294e-06, + "loss": 0.5373, + "step": 6787 + }, + { + "epoch": 0.53, + "grad_norm": 1.1345811913944404, + "learning_rate": 9.624444011482302e-06, + "loss": 0.5027, + "step": 6788 + }, + { + "epoch": 0.53, + "grad_norm": 1.3148610812590216, + "learning_rate": 9.6219331248721e-06, + "loss": 0.6111, + "step": 6789 + }, + { + "epoch": 0.53, + "grad_norm": 1.120634775879702, + "learning_rate": 9.619422262131208e-06, + "loss": 0.4998, + "step": 6790 + }, + { + "epoch": 0.53, + "grad_norm": 1.18938006096794, + "learning_rate": 9.616911423418153e-06, + "loss": 0.5491, + "step": 6791 + }, + { + "epoch": 0.53, + "grad_norm": 1.3028522764096522, + "learning_rate": 9.61440060889146e-06, + "loss": 0.6272, + "step": 6792 + }, + { + "epoch": 0.53, + "grad_norm": 1.0668678262890638, + "learning_rate": 9.611889818709643e-06, + "loss": 0.5332, + "step": 6793 + }, + { + "epoch": 0.53, + "grad_norm": 1.1850208202577182, + "learning_rate": 9.609379053031224e-06, + "loss": 0.5584, + "step": 6794 + }, + { + "epoch": 0.53, + "grad_norm": 1.1292958362829233, + "learning_rate": 9.60686831201472e-06, + "loss": 0.5175, + "step": 6795 + }, + { + "epoch": 0.53, + "grad_norm": 1.1613028155615366, + "learning_rate": 9.604357595818647e-06, + "loss": 0.5409, + "step": 6796 + }, + { + "epoch": 0.53, + "grad_norm": 1.1523969217737962, + "learning_rate": 9.601846904601526e-06, + "loss": 0.608, + "step": 6797 + }, + { + "epoch": 0.53, + "grad_norm": 1.1970531779198983, + "learning_rate": 9.599336238521859e-06, + "loss": 0.5111, + "step": 6798 + }, + { + "epoch": 0.53, + "grad_norm": 1.1778648898810051, + "learning_rate": 9.596825597738164e-06, + "loss": 0.5495, + "step": 6799 + }, + { + "epoch": 0.53, + "grad_norm": 1.1870283896916536, + "learning_rate": 9.594314982408947e-06, + "loss": 0.5553, + "step": 6800 + }, + { + "epoch": 0.53, + "grad_norm": 1.1901931840700481, + "learning_rate": 9.591804392692719e-06, + "loss": 0.5627, + "step": 6801 + }, + { + "epoch": 0.53, + "grad_norm": 1.1053506093874272, + "learning_rate": 9.589293828747988e-06, + "loss": 0.5174, + "step": 6802 + }, + { + "epoch": 0.53, + "grad_norm": 1.2732401969977203, + "learning_rate": 9.586783290733254e-06, + "loss": 0.5677, + "step": 6803 + }, + { + "epoch": 0.53, + "grad_norm": 1.1656762528593017, + "learning_rate": 9.584272778807026e-06, + "loss": 0.5585, + "step": 6804 + }, + { + "epoch": 0.53, + "grad_norm": 1.1832376580222328, + "learning_rate": 9.581762293127798e-06, + "loss": 0.5162, + "step": 6805 + }, + { + "epoch": 0.53, + "grad_norm": 1.0895200952932398, + "learning_rate": 9.579251833854076e-06, + "loss": 0.5438, + "step": 6806 + }, + { + "epoch": 0.53, + "grad_norm": 1.126061786162506, + "learning_rate": 9.57674140114436e-06, + "loss": 0.5528, + "step": 6807 + }, + { + "epoch": 0.53, + "grad_norm": 1.2020671206504352, + "learning_rate": 9.574230995157137e-06, + "loss": 0.5021, + "step": 6808 + }, + { + "epoch": 0.53, + "grad_norm": 1.2667014642168681, + "learning_rate": 9.571720616050913e-06, + "loss": 0.5896, + "step": 6809 + }, + { + "epoch": 0.53, + "grad_norm": 1.1196191107699502, + "learning_rate": 9.569210263984173e-06, + "loss": 0.517, + "step": 6810 + }, + { + "epoch": 0.53, + "grad_norm": 1.0555079906744835, + "learning_rate": 9.566699939115412e-06, + "loss": 0.5122, + "step": 6811 + }, + { + "epoch": 0.53, + "grad_norm": 1.1921523962689797, + "learning_rate": 9.564189641603123e-06, + "loss": 0.5579, + "step": 6812 + }, + { + "epoch": 0.53, + "grad_norm": 1.2114677498807633, + "learning_rate": 9.561679371605786e-06, + "loss": 0.5154, + "step": 6813 + }, + { + "epoch": 0.53, + "grad_norm": 1.132669584532855, + "learning_rate": 9.559169129281893e-06, + "loss": 0.5237, + "step": 6814 + }, + { + "epoch": 0.53, + "grad_norm": 1.1251716482822274, + "learning_rate": 9.556658914789926e-06, + "loss": 0.5107, + "step": 6815 + }, + { + "epoch": 0.53, + "grad_norm": 1.1155361962971095, + "learning_rate": 9.554148728288371e-06, + "loss": 0.488, + "step": 6816 + }, + { + "epoch": 0.53, + "grad_norm": 1.2365542622316654, + "learning_rate": 9.551638569935708e-06, + "loss": 0.5538, + "step": 6817 + }, + { + "epoch": 0.53, + "grad_norm": 1.1413754124167022, + "learning_rate": 9.549128439890413e-06, + "loss": 0.5174, + "step": 6818 + }, + { + "epoch": 0.53, + "grad_norm": 1.210704461100857, + "learning_rate": 9.546618338310968e-06, + "loss": 0.5515, + "step": 6819 + }, + { + "epoch": 0.53, + "grad_norm": 1.158660643917127, + "learning_rate": 9.544108265355843e-06, + "loss": 0.5085, + "step": 6820 + }, + { + "epoch": 0.53, + "grad_norm": 1.2350867911201326, + "learning_rate": 9.541598221183516e-06, + "loss": 0.5753, + "step": 6821 + }, + { + "epoch": 0.53, + "grad_norm": 1.192111247612642, + "learning_rate": 9.539088205952463e-06, + "loss": 0.4856, + "step": 6822 + }, + { + "epoch": 0.53, + "grad_norm": 1.1520995835071648, + "learning_rate": 9.536578219821146e-06, + "loss": 0.4988, + "step": 6823 + }, + { + "epoch": 0.53, + "grad_norm": 1.298774477202078, + "learning_rate": 9.53406826294804e-06, + "loss": 0.5703, + "step": 6824 + }, + { + "epoch": 0.53, + "grad_norm": 1.22550797537454, + "learning_rate": 9.531558335491602e-06, + "loss": 0.531, + "step": 6825 + }, + { + "epoch": 0.53, + "grad_norm": 1.168792587157156, + "learning_rate": 9.529048437610307e-06, + "loss": 0.5501, + "step": 6826 + }, + { + "epoch": 0.53, + "grad_norm": 1.2242872675434198, + "learning_rate": 9.526538569462617e-06, + "loss": 0.5519, + "step": 6827 + }, + { + "epoch": 0.53, + "grad_norm": 1.2389060288447225, + "learning_rate": 9.524028731206984e-06, + "loss": 0.5388, + "step": 6828 + }, + { + "epoch": 0.53, + "grad_norm": 1.1037690808384706, + "learning_rate": 9.521518923001877e-06, + "loss": 0.446, + "step": 6829 + }, + { + "epoch": 0.53, + "grad_norm": 1.1709760651296448, + "learning_rate": 9.519009145005747e-06, + "loss": 0.5549, + "step": 6830 + }, + { + "epoch": 0.53, + "grad_norm": 1.1214836585648127, + "learning_rate": 9.516499397377049e-06, + "loss": 0.5331, + "step": 6831 + }, + { + "epoch": 0.53, + "grad_norm": 1.1658898673939344, + "learning_rate": 9.513989680274241e-06, + "loss": 0.5721, + "step": 6832 + }, + { + "epoch": 0.53, + "grad_norm": 1.1407743904838614, + "learning_rate": 9.511479993855776e-06, + "loss": 0.5538, + "step": 6833 + }, + { + "epoch": 0.53, + "grad_norm": 1.2103429011302314, + "learning_rate": 9.508970338280097e-06, + "loss": 0.5871, + "step": 6834 + }, + { + "epoch": 0.53, + "grad_norm": 1.1783403194783333, + "learning_rate": 9.50646071370565e-06, + "loss": 0.563, + "step": 6835 + }, + { + "epoch": 0.53, + "grad_norm": 1.25163257797966, + "learning_rate": 9.503951120290886e-06, + "loss": 0.5629, + "step": 6836 + }, + { + "epoch": 0.53, + "grad_norm": 1.0621521324202032, + "learning_rate": 9.501441558194247e-06, + "loss": 0.5365, + "step": 6837 + }, + { + "epoch": 0.53, + "grad_norm": 1.3286018413147922, + "learning_rate": 9.498932027574179e-06, + "loss": 0.5943, + "step": 6838 + }, + { + "epoch": 0.53, + "grad_norm": 1.140774860727318, + "learning_rate": 9.496422528589115e-06, + "loss": 0.5767, + "step": 6839 + }, + { + "epoch": 0.53, + "grad_norm": 1.2509644125855384, + "learning_rate": 9.493913061397493e-06, + "loss": 0.5278, + "step": 6840 + }, + { + "epoch": 0.53, + "grad_norm": 1.0820306440551384, + "learning_rate": 9.49140362615775e-06, + "loss": 0.5561, + "step": 6841 + }, + { + "epoch": 0.53, + "grad_norm": 1.1487220359777455, + "learning_rate": 9.488894223028322e-06, + "loss": 0.5719, + "step": 6842 + }, + { + "epoch": 0.53, + "grad_norm": 1.2308671578778951, + "learning_rate": 9.48638485216764e-06, + "loss": 0.5494, + "step": 6843 + }, + { + "epoch": 0.53, + "grad_norm": 1.1264529382599893, + "learning_rate": 9.483875513734131e-06, + "loss": 0.5611, + "step": 6844 + }, + { + "epoch": 0.53, + "grad_norm": 1.1681477081067346, + "learning_rate": 9.481366207886223e-06, + "loss": 0.5515, + "step": 6845 + }, + { + "epoch": 0.53, + "grad_norm": 1.1517450878882316, + "learning_rate": 9.47885693478234e-06, + "loss": 0.5623, + "step": 6846 + }, + { + "epoch": 0.53, + "grad_norm": 1.1045334164912133, + "learning_rate": 9.476347694580911e-06, + "loss": 0.5322, + "step": 6847 + }, + { + "epoch": 0.53, + "grad_norm": 1.156335363587699, + "learning_rate": 9.473838487440354e-06, + "loss": 0.5257, + "step": 6848 + }, + { + "epoch": 0.53, + "grad_norm": 1.1668430376706829, + "learning_rate": 9.471329313519086e-06, + "loss": 0.5719, + "step": 6849 + }, + { + "epoch": 0.53, + "grad_norm": 1.1390214910045442, + "learning_rate": 9.468820172975525e-06, + "loss": 0.5465, + "step": 6850 + }, + { + "epoch": 0.53, + "grad_norm": 1.3029624361872645, + "learning_rate": 9.466311065968088e-06, + "loss": 0.5726, + "step": 6851 + }, + { + "epoch": 0.53, + "grad_norm": 1.0708037870811096, + "learning_rate": 9.463801992655187e-06, + "loss": 0.5075, + "step": 6852 + }, + { + "epoch": 0.53, + "grad_norm": 1.1286208376062532, + "learning_rate": 9.461292953195234e-06, + "loss": 0.5303, + "step": 6853 + }, + { + "epoch": 0.53, + "grad_norm": 1.1378655695096467, + "learning_rate": 9.458783947746635e-06, + "loss": 0.4978, + "step": 6854 + }, + { + "epoch": 0.53, + "grad_norm": 1.2296301027350895, + "learning_rate": 9.456274976467798e-06, + "loss": 0.5611, + "step": 6855 + }, + { + "epoch": 0.53, + "grad_norm": 1.2008565845572483, + "learning_rate": 9.453766039517126e-06, + "loss": 0.5897, + "step": 6856 + }, + { + "epoch": 0.53, + "grad_norm": 1.1358637789766757, + "learning_rate": 9.451257137053022e-06, + "loss": 0.5175, + "step": 6857 + }, + { + "epoch": 0.53, + "grad_norm": 1.0870034454417437, + "learning_rate": 9.448748269233889e-06, + "loss": 0.5717, + "step": 6858 + }, + { + "epoch": 0.53, + "grad_norm": 1.1680117698069887, + "learning_rate": 9.44623943621812e-06, + "loss": 0.562, + "step": 6859 + }, + { + "epoch": 0.53, + "grad_norm": 1.2131268797933095, + "learning_rate": 9.443730638164112e-06, + "loss": 0.5597, + "step": 6860 + }, + { + "epoch": 0.53, + "grad_norm": 1.161619195218998, + "learning_rate": 9.441221875230257e-06, + "loss": 0.5318, + "step": 6861 + }, + { + "epoch": 0.53, + "grad_norm": 1.193824889545189, + "learning_rate": 9.438713147574951e-06, + "loss": 0.5891, + "step": 6862 + }, + { + "epoch": 0.53, + "grad_norm": 1.2061538717281686, + "learning_rate": 9.43620445535658e-06, + "loss": 0.5521, + "step": 6863 + }, + { + "epoch": 0.53, + "grad_norm": 1.1358672423326084, + "learning_rate": 9.43369579873353e-06, + "loss": 0.5489, + "step": 6864 + }, + { + "epoch": 0.53, + "grad_norm": 1.112326014768923, + "learning_rate": 9.431187177864184e-06, + "loss": 0.5347, + "step": 6865 + }, + { + "epoch": 0.53, + "grad_norm": 1.2957539596452956, + "learning_rate": 9.428678592906925e-06, + "loss": 0.5118, + "step": 6866 + }, + { + "epoch": 0.53, + "grad_norm": 1.2199467383155636, + "learning_rate": 9.426170044020139e-06, + "loss": 0.5734, + "step": 6867 + }, + { + "epoch": 0.53, + "grad_norm": 1.2289500234171722, + "learning_rate": 9.423661531362197e-06, + "loss": 0.5753, + "step": 6868 + }, + { + "epoch": 0.53, + "grad_norm": 1.159313576605824, + "learning_rate": 9.421153055091477e-06, + "loss": 0.5575, + "step": 6869 + }, + { + "epoch": 0.53, + "grad_norm": 1.133401967289168, + "learning_rate": 9.418644615366346e-06, + "loss": 0.5535, + "step": 6870 + }, + { + "epoch": 0.53, + "grad_norm": 1.0925077195407658, + "learning_rate": 9.416136212345183e-06, + "loss": 0.5117, + "step": 6871 + }, + { + "epoch": 0.53, + "grad_norm": 1.1616052383553008, + "learning_rate": 9.413627846186354e-06, + "loss": 0.5611, + "step": 6872 + }, + { + "epoch": 0.53, + "grad_norm": 1.192798388994563, + "learning_rate": 9.411119517048226e-06, + "loss": 0.5265, + "step": 6873 + }, + { + "epoch": 0.53, + "grad_norm": 1.172370195666303, + "learning_rate": 9.40861122508916e-06, + "loss": 0.5269, + "step": 6874 + }, + { + "epoch": 0.53, + "grad_norm": 1.13235438061179, + "learning_rate": 9.406102970467515e-06, + "loss": 0.5494, + "step": 6875 + }, + { + "epoch": 0.53, + "grad_norm": 1.1302768082977968, + "learning_rate": 9.403594753341655e-06, + "loss": 0.5299, + "step": 6876 + }, + { + "epoch": 0.53, + "grad_norm": 1.1358724898214763, + "learning_rate": 9.401086573869937e-06, + "loss": 0.5687, + "step": 6877 + }, + { + "epoch": 0.53, + "grad_norm": 1.1984956033440617, + "learning_rate": 9.398578432210713e-06, + "loss": 0.5353, + "step": 6878 + }, + { + "epoch": 0.53, + "grad_norm": 1.262870664288372, + "learning_rate": 9.396070328522336e-06, + "loss": 0.5797, + "step": 6879 + }, + { + "epoch": 0.53, + "grad_norm": 1.2578837688466031, + "learning_rate": 9.393562262963154e-06, + "loss": 0.5696, + "step": 6880 + }, + { + "epoch": 0.53, + "grad_norm": 1.0445816769692662, + "learning_rate": 9.391054235691515e-06, + "loss": 0.4845, + "step": 6881 + }, + { + "epoch": 0.53, + "grad_norm": 1.1939792552724164, + "learning_rate": 9.388546246865764e-06, + "loss": 0.5524, + "step": 6882 + }, + { + "epoch": 0.53, + "grad_norm": 1.185410267031021, + "learning_rate": 9.386038296644245e-06, + "loss": 0.5581, + "step": 6883 + }, + { + "epoch": 0.53, + "grad_norm": 1.1838205957358947, + "learning_rate": 9.383530385185295e-06, + "loss": 0.5594, + "step": 6884 + }, + { + "epoch": 0.53, + "grad_norm": 1.2021331662184975, + "learning_rate": 9.381022512647251e-06, + "loss": 0.5972, + "step": 6885 + }, + { + "epoch": 0.53, + "grad_norm": 1.2612453552856213, + "learning_rate": 9.37851467918845e-06, + "loss": 0.5922, + "step": 6886 + }, + { + "epoch": 0.53, + "grad_norm": 1.115468977588695, + "learning_rate": 9.376006884967224e-06, + "loss": 0.473, + "step": 6887 + }, + { + "epoch": 0.53, + "grad_norm": 1.140491111944837, + "learning_rate": 9.373499130141901e-06, + "loss": 0.5621, + "step": 6888 + }, + { + "epoch": 0.53, + "grad_norm": 1.0999414948597301, + "learning_rate": 9.370991414870814e-06, + "loss": 0.5277, + "step": 6889 + }, + { + "epoch": 0.53, + "grad_norm": 1.1706286605557767, + "learning_rate": 9.368483739312281e-06, + "loss": 0.5493, + "step": 6890 + }, + { + "epoch": 0.53, + "grad_norm": 1.1421705252538015, + "learning_rate": 9.365976103624628e-06, + "loss": 0.554, + "step": 6891 + }, + { + "epoch": 0.53, + "grad_norm": 1.174512007001139, + "learning_rate": 9.363468507966175e-06, + "loss": 0.5224, + "step": 6892 + }, + { + "epoch": 0.53, + "grad_norm": 1.1151110151645698, + "learning_rate": 9.360960952495239e-06, + "loss": 0.5403, + "step": 6893 + }, + { + "epoch": 0.53, + "grad_norm": 1.2198132498656216, + "learning_rate": 9.35845343737014e-06, + "loss": 0.5749, + "step": 6894 + }, + { + "epoch": 0.53, + "grad_norm": 1.2073646751536584, + "learning_rate": 9.355945962749179e-06, + "loss": 0.5631, + "step": 6895 + }, + { + "epoch": 0.53, + "grad_norm": 1.1083731627418414, + "learning_rate": 9.353438528790673e-06, + "loss": 0.4935, + "step": 6896 + }, + { + "epoch": 0.54, + "grad_norm": 1.1268531054138002, + "learning_rate": 9.350931135652932e-06, + "loss": 0.5463, + "step": 6897 + }, + { + "epoch": 0.54, + "grad_norm": 1.2076929741427251, + "learning_rate": 9.348423783494253e-06, + "loss": 0.5921, + "step": 6898 + }, + { + "epoch": 0.54, + "grad_norm": 1.1776943421400163, + "learning_rate": 9.345916472472947e-06, + "loss": 0.5425, + "step": 6899 + }, + { + "epoch": 0.54, + "grad_norm": 1.2562159481462871, + "learning_rate": 9.343409202747306e-06, + "loss": 0.591, + "step": 6900 + }, + { + "epoch": 0.54, + "grad_norm": 1.0744898228649296, + "learning_rate": 9.340901974475627e-06, + "loss": 0.5436, + "step": 6901 + }, + { + "epoch": 0.54, + "grad_norm": 1.2216132837387694, + "learning_rate": 9.33839478781621e-06, + "loss": 0.5736, + "step": 6902 + }, + { + "epoch": 0.54, + "grad_norm": 1.1346967088582525, + "learning_rate": 9.335887642927342e-06, + "loss": 0.5508, + "step": 6903 + }, + { + "epoch": 0.54, + "grad_norm": 1.2722628271618521, + "learning_rate": 9.333380539967319e-06, + "loss": 0.5887, + "step": 6904 + }, + { + "epoch": 0.54, + "grad_norm": 1.1576385408546024, + "learning_rate": 9.330873479094415e-06, + "loss": 0.5157, + "step": 6905 + }, + { + "epoch": 0.54, + "grad_norm": 1.0669097829466203, + "learning_rate": 9.328366460466921e-06, + "loss": 0.5398, + "step": 6906 + }, + { + "epoch": 0.54, + "grad_norm": 1.140139228115547, + "learning_rate": 9.325859484243121e-06, + "loss": 0.552, + "step": 6907 + }, + { + "epoch": 0.54, + "grad_norm": 1.1109227948080684, + "learning_rate": 9.323352550581285e-06, + "loss": 0.4995, + "step": 6908 + }, + { + "epoch": 0.54, + "grad_norm": 1.323131688983571, + "learning_rate": 9.320845659639702e-06, + "loss": 0.6106, + "step": 6909 + }, + { + "epoch": 0.54, + "grad_norm": 1.1660169540729968, + "learning_rate": 9.31833881157663e-06, + "loss": 0.5586, + "step": 6910 + }, + { + "epoch": 0.54, + "grad_norm": 1.2727689802005524, + "learning_rate": 9.315832006550345e-06, + "loss": 0.5651, + "step": 6911 + }, + { + "epoch": 0.54, + "grad_norm": 1.2606625698015634, + "learning_rate": 9.313325244719117e-06, + "loss": 0.5756, + "step": 6912 + }, + { + "epoch": 0.54, + "grad_norm": 1.1544550805024634, + "learning_rate": 9.310818526241209e-06, + "loss": 0.515, + "step": 6913 + }, + { + "epoch": 0.54, + "grad_norm": 1.2779881896128449, + "learning_rate": 9.308311851274885e-06, + "loss": 0.5994, + "step": 6914 + }, + { + "epoch": 0.54, + "grad_norm": 1.2377965321386084, + "learning_rate": 9.3058052199784e-06, + "loss": 0.5549, + "step": 6915 + }, + { + "epoch": 0.54, + "grad_norm": 1.1371878478617607, + "learning_rate": 9.303298632510014e-06, + "loss": 0.537, + "step": 6916 + }, + { + "epoch": 0.54, + "grad_norm": 1.149749162518121, + "learning_rate": 9.30079208902798e-06, + "loss": 0.5475, + "step": 6917 + }, + { + "epoch": 0.54, + "grad_norm": 1.3233202471354304, + "learning_rate": 9.298285589690548e-06, + "loss": 0.5925, + "step": 6918 + }, + { + "epoch": 0.54, + "grad_norm": 1.063030951733851, + "learning_rate": 9.29577913465597e-06, + "loss": 0.5224, + "step": 6919 + }, + { + "epoch": 0.54, + "grad_norm": 1.1598342808374553, + "learning_rate": 9.293272724082484e-06, + "loss": 0.5138, + "step": 6920 + }, + { + "epoch": 0.54, + "grad_norm": 1.150487044071225, + "learning_rate": 9.290766358128338e-06, + "loss": 0.4975, + "step": 6921 + }, + { + "epoch": 0.54, + "grad_norm": 1.1169438129695117, + "learning_rate": 9.288260036951774e-06, + "loss": 0.5267, + "step": 6922 + }, + { + "epoch": 0.54, + "grad_norm": 1.2041056216033064, + "learning_rate": 9.285753760711023e-06, + "loss": 0.5532, + "step": 6923 + }, + { + "epoch": 0.54, + "grad_norm": 1.1674193043563388, + "learning_rate": 9.283247529564326e-06, + "loss": 0.5646, + "step": 6924 + }, + { + "epoch": 0.54, + "grad_norm": 1.1768027395955605, + "learning_rate": 9.280741343669908e-06, + "loss": 0.545, + "step": 6925 + }, + { + "epoch": 0.54, + "grad_norm": 1.081877494381608, + "learning_rate": 9.278235203185999e-06, + "loss": 0.4693, + "step": 6926 + }, + { + "epoch": 0.54, + "grad_norm": 1.1036741969862582, + "learning_rate": 9.275729108270825e-06, + "loss": 0.5205, + "step": 6927 + }, + { + "epoch": 0.54, + "grad_norm": 1.1665832058254102, + "learning_rate": 9.27322305908261e-06, + "loss": 0.4809, + "step": 6928 + }, + { + "epoch": 0.54, + "grad_norm": 1.2345403246684257, + "learning_rate": 9.270717055779575e-06, + "loss": 0.5734, + "step": 6929 + }, + { + "epoch": 0.54, + "grad_norm": 1.1085644837079052, + "learning_rate": 9.268211098519932e-06, + "loss": 0.4978, + "step": 6930 + }, + { + "epoch": 0.54, + "grad_norm": 1.0934605351649394, + "learning_rate": 9.265705187461898e-06, + "loss": 0.5376, + "step": 6931 + }, + { + "epoch": 0.54, + "grad_norm": 1.117967133484211, + "learning_rate": 9.263199322763687e-06, + "loss": 0.5437, + "step": 6932 + }, + { + "epoch": 0.54, + "grad_norm": 1.1633346125223145, + "learning_rate": 9.2606935045835e-06, + "loss": 0.5395, + "step": 6933 + }, + { + "epoch": 0.54, + "grad_norm": 1.2807320268849065, + "learning_rate": 9.258187733079552e-06, + "loss": 0.5728, + "step": 6934 + }, + { + "epoch": 0.54, + "grad_norm": 1.1802597173877796, + "learning_rate": 9.255682008410034e-06, + "loss": 0.5462, + "step": 6935 + }, + { + "epoch": 0.54, + "grad_norm": 1.0465943828657278, + "learning_rate": 9.253176330733153e-06, + "loss": 0.4721, + "step": 6936 + }, + { + "epoch": 0.54, + "grad_norm": 1.0977897698589962, + "learning_rate": 9.250670700207103e-06, + "loss": 0.4856, + "step": 6937 + }, + { + "epoch": 0.54, + "grad_norm": 1.1913492658151363, + "learning_rate": 9.248165116990078e-06, + "loss": 0.5302, + "step": 6938 + }, + { + "epoch": 0.54, + "grad_norm": 1.2471735470191296, + "learning_rate": 9.245659581240271e-06, + "loss": 0.5617, + "step": 6939 + }, + { + "epoch": 0.54, + "grad_norm": 1.0122175128723463, + "learning_rate": 9.243154093115862e-06, + "loss": 0.4811, + "step": 6940 + }, + { + "epoch": 0.54, + "grad_norm": 1.1170565121561231, + "learning_rate": 9.240648652775042e-06, + "loss": 0.5079, + "step": 6941 + }, + { + "epoch": 0.54, + "grad_norm": 1.2224205691816439, + "learning_rate": 9.23814326037599e-06, + "loss": 0.5465, + "step": 6942 + }, + { + "epoch": 0.54, + "grad_norm": 1.1123657209817601, + "learning_rate": 9.235637916076884e-06, + "loss": 0.5586, + "step": 6943 + }, + { + "epoch": 0.54, + "grad_norm": 1.0958176823261065, + "learning_rate": 9.233132620035899e-06, + "loss": 0.5004, + "step": 6944 + }, + { + "epoch": 0.54, + "grad_norm": 1.1805004830074655, + "learning_rate": 9.230627372411213e-06, + "loss": 0.5226, + "step": 6945 + }, + { + "epoch": 0.54, + "grad_norm": 1.1487981009556634, + "learning_rate": 9.228122173360986e-06, + "loss": 0.4874, + "step": 6946 + }, + { + "epoch": 0.54, + "grad_norm": 1.1204726462508574, + "learning_rate": 9.22561702304339e-06, + "loss": 0.5522, + "step": 6947 + }, + { + "epoch": 0.54, + "grad_norm": 1.189242941039336, + "learning_rate": 9.223111921616585e-06, + "loss": 0.5627, + "step": 6948 + }, + { + "epoch": 0.54, + "grad_norm": 1.184960108540278, + "learning_rate": 9.220606869238733e-06, + "loss": 0.5342, + "step": 6949 + }, + { + "epoch": 0.54, + "grad_norm": 1.2431689529742278, + "learning_rate": 9.218101866067995e-06, + "loss": 0.5962, + "step": 6950 + }, + { + "epoch": 0.54, + "grad_norm": 1.1778757696852609, + "learning_rate": 9.215596912262515e-06, + "loss": 0.5381, + "step": 6951 + }, + { + "epoch": 0.54, + "grad_norm": 1.2069500645861622, + "learning_rate": 9.213092007980453e-06, + "loss": 0.5376, + "step": 6952 + }, + { + "epoch": 0.54, + "grad_norm": 1.151024324910753, + "learning_rate": 9.21058715337995e-06, + "loss": 0.5071, + "step": 6953 + }, + { + "epoch": 0.54, + "grad_norm": 1.0645189177452357, + "learning_rate": 9.208082348619149e-06, + "loss": 0.5426, + "step": 6954 + }, + { + "epoch": 0.54, + "grad_norm": 1.1235549440925061, + "learning_rate": 9.205577593856203e-06, + "loss": 0.5517, + "step": 6955 + }, + { + "epoch": 0.54, + "grad_norm": 1.1496807297893243, + "learning_rate": 9.203072889249237e-06, + "loss": 0.5198, + "step": 6956 + }, + { + "epoch": 0.54, + "grad_norm": 1.0767754110292076, + "learning_rate": 9.200568234956393e-06, + "loss": 0.5457, + "step": 6957 + }, + { + "epoch": 0.54, + "grad_norm": 1.1517957515849504, + "learning_rate": 9.198063631135799e-06, + "loss": 0.5566, + "step": 6958 + }, + { + "epoch": 0.54, + "grad_norm": 1.173953184552595, + "learning_rate": 9.195559077945586e-06, + "loss": 0.564, + "step": 6959 + }, + { + "epoch": 0.54, + "grad_norm": 1.221985215389513, + "learning_rate": 9.193054575543883e-06, + "loss": 0.5874, + "step": 6960 + }, + { + "epoch": 0.54, + "grad_norm": 1.2115907445276382, + "learning_rate": 9.190550124088802e-06, + "loss": 0.5805, + "step": 6961 + }, + { + "epoch": 0.54, + "grad_norm": 1.2395495348684942, + "learning_rate": 9.188045723738471e-06, + "loss": 0.5715, + "step": 6962 + }, + { + "epoch": 0.54, + "grad_norm": 1.0463720793240696, + "learning_rate": 9.185541374651001e-06, + "loss": 0.5656, + "step": 6963 + }, + { + "epoch": 0.54, + "grad_norm": 1.2738552227489512, + "learning_rate": 9.183037076984505e-06, + "loss": 0.5581, + "step": 6964 + }, + { + "epoch": 0.54, + "grad_norm": 1.1700438370602617, + "learning_rate": 9.180532830897099e-06, + "loss": 0.5691, + "step": 6965 + }, + { + "epoch": 0.54, + "grad_norm": 1.1466987058595464, + "learning_rate": 9.178028636546879e-06, + "loss": 0.5725, + "step": 6966 + }, + { + "epoch": 0.54, + "grad_norm": 1.0806897933712363, + "learning_rate": 9.175524494091951e-06, + "loss": 0.5095, + "step": 6967 + }, + { + "epoch": 0.54, + "grad_norm": 1.10736918113791, + "learning_rate": 9.173020403690417e-06, + "loss": 0.502, + "step": 6968 + }, + { + "epoch": 0.54, + "grad_norm": 1.1199586551562242, + "learning_rate": 9.17051636550037e-06, + "loss": 0.5631, + "step": 6969 + }, + { + "epoch": 0.54, + "grad_norm": 1.0493294141272902, + "learning_rate": 9.168012379679909e-06, + "loss": 0.5044, + "step": 6970 + }, + { + "epoch": 0.54, + "grad_norm": 1.2162419457477667, + "learning_rate": 9.165508446387114e-06, + "loss": 0.5426, + "step": 6971 + }, + { + "epoch": 0.54, + "grad_norm": 1.2508817900374918, + "learning_rate": 9.16300456578008e-06, + "loss": 0.571, + "step": 6972 + }, + { + "epoch": 0.54, + "grad_norm": 1.1812503390841527, + "learning_rate": 9.160500738016883e-06, + "loss": 0.5793, + "step": 6973 + }, + { + "epoch": 0.54, + "grad_norm": 1.274405737371036, + "learning_rate": 9.157996963255605e-06, + "loss": 0.5297, + "step": 6974 + }, + { + "epoch": 0.54, + "grad_norm": 1.009623061839108, + "learning_rate": 9.155493241654327e-06, + "loss": 0.4643, + "step": 6975 + }, + { + "epoch": 0.54, + "grad_norm": 1.2002514794214814, + "learning_rate": 9.152989573371115e-06, + "loss": 0.5486, + "step": 6976 + }, + { + "epoch": 0.54, + "grad_norm": 1.2772244272290916, + "learning_rate": 9.150485958564043e-06, + "loss": 0.603, + "step": 6977 + }, + { + "epoch": 0.54, + "grad_norm": 1.1033836623426903, + "learning_rate": 9.147982397391172e-06, + "loss": 0.5454, + "step": 6978 + }, + { + "epoch": 0.54, + "grad_norm": 1.2115514367960964, + "learning_rate": 9.145478890010568e-06, + "loss": 0.571, + "step": 6979 + }, + { + "epoch": 0.54, + "grad_norm": 1.2421910987657923, + "learning_rate": 9.142975436580297e-06, + "loss": 0.5611, + "step": 6980 + }, + { + "epoch": 0.54, + "grad_norm": 1.1870311514214298, + "learning_rate": 9.140472037258403e-06, + "loss": 0.5261, + "step": 6981 + }, + { + "epoch": 0.54, + "grad_norm": 1.1489866850955477, + "learning_rate": 9.137968692202945e-06, + "loss": 0.5315, + "step": 6982 + }, + { + "epoch": 0.54, + "grad_norm": 1.0719251526423996, + "learning_rate": 9.13546540157197e-06, + "loss": 0.5379, + "step": 6983 + }, + { + "epoch": 0.54, + "grad_norm": 1.3033732116107446, + "learning_rate": 9.132962165523523e-06, + "loss": 0.5357, + "step": 6984 + }, + { + "epoch": 0.54, + "grad_norm": 1.210522537503795, + "learning_rate": 9.130458984215652e-06, + "loss": 0.5308, + "step": 6985 + }, + { + "epoch": 0.54, + "grad_norm": 1.2480797800612613, + "learning_rate": 9.12795585780639e-06, + "loss": 0.5878, + "step": 6986 + }, + { + "epoch": 0.54, + "grad_norm": 1.14679730643311, + "learning_rate": 9.125452786453774e-06, + "loss": 0.5225, + "step": 6987 + }, + { + "epoch": 0.54, + "grad_norm": 1.221749307553021, + "learning_rate": 9.122949770315834e-06, + "loss": 0.5507, + "step": 6988 + }, + { + "epoch": 0.54, + "grad_norm": 1.3355400431152264, + "learning_rate": 9.1204468095506e-06, + "loss": 0.6395, + "step": 6989 + }, + { + "epoch": 0.54, + "grad_norm": 1.282097931312729, + "learning_rate": 9.1179439043161e-06, + "loss": 0.5805, + "step": 6990 + }, + { + "epoch": 0.54, + "grad_norm": 1.1581507859136013, + "learning_rate": 9.11544105477035e-06, + "loss": 0.5557, + "step": 6991 + }, + { + "epoch": 0.54, + "grad_norm": 1.2234400470263787, + "learning_rate": 9.11293826107137e-06, + "loss": 0.6168, + "step": 6992 + }, + { + "epoch": 0.54, + "grad_norm": 1.1379268034170729, + "learning_rate": 9.110435523377173e-06, + "loss": 0.5406, + "step": 6993 + }, + { + "epoch": 0.54, + "grad_norm": 1.0996749614446293, + "learning_rate": 9.10793284184577e-06, + "loss": 0.5462, + "step": 6994 + }, + { + "epoch": 0.54, + "grad_norm": 1.0379292898330597, + "learning_rate": 9.105430216635174e-06, + "loss": 0.5362, + "step": 6995 + }, + { + "epoch": 0.54, + "grad_norm": 1.1427434817912512, + "learning_rate": 9.102927647903382e-06, + "loss": 0.5539, + "step": 6996 + }, + { + "epoch": 0.54, + "grad_norm": 1.1408329081807391, + "learning_rate": 9.100425135808396e-06, + "loss": 0.5236, + "step": 6997 + }, + { + "epoch": 0.54, + "grad_norm": 1.0264476740935105, + "learning_rate": 9.09792268050821e-06, + "loss": 0.4896, + "step": 6998 + }, + { + "epoch": 0.54, + "grad_norm": 1.1323937003666193, + "learning_rate": 9.09542028216082e-06, + "loss": 0.487, + "step": 6999 + }, + { + "epoch": 0.54, + "grad_norm": 1.1424002745362227, + "learning_rate": 9.092917940924216e-06, + "loss": 0.5429, + "step": 7000 + }, + { + "epoch": 0.54, + "grad_norm": 1.4034000764289825, + "learning_rate": 9.090415656956382e-06, + "loss": 0.5719, + "step": 7001 + }, + { + "epoch": 0.54, + "grad_norm": 1.0883032189292592, + "learning_rate": 9.0879134304153e-06, + "loss": 0.5291, + "step": 7002 + }, + { + "epoch": 0.54, + "grad_norm": 1.1988742594513302, + "learning_rate": 9.085411261458948e-06, + "loss": 0.554, + "step": 7003 + }, + { + "epoch": 0.54, + "grad_norm": 1.155488149855366, + "learning_rate": 9.082909150245302e-06, + "loss": 0.5646, + "step": 7004 + }, + { + "epoch": 0.54, + "grad_norm": 1.2430118248822817, + "learning_rate": 9.080407096932334e-06, + "loss": 0.5543, + "step": 7005 + }, + { + "epoch": 0.54, + "grad_norm": 1.143835331386233, + "learning_rate": 9.077905101678011e-06, + "loss": 0.5131, + "step": 7006 + }, + { + "epoch": 0.54, + "grad_norm": 1.1471499017442481, + "learning_rate": 9.075403164640296e-06, + "loss": 0.5024, + "step": 7007 + }, + { + "epoch": 0.54, + "grad_norm": 1.138680826089791, + "learning_rate": 9.072901285977148e-06, + "loss": 0.52, + "step": 7008 + }, + { + "epoch": 0.54, + "grad_norm": 1.2051025585992894, + "learning_rate": 9.070399465846524e-06, + "loss": 0.5342, + "step": 7009 + }, + { + "epoch": 0.54, + "grad_norm": 1.1450505038954624, + "learning_rate": 9.06789770440638e-06, + "loss": 0.5494, + "step": 7010 + }, + { + "epoch": 0.54, + "grad_norm": 1.1068477588922272, + "learning_rate": 9.065396001814665e-06, + "loss": 0.5358, + "step": 7011 + }, + { + "epoch": 0.54, + "grad_norm": 1.2966130003492022, + "learning_rate": 9.06289435822932e-06, + "loss": 0.6066, + "step": 7012 + }, + { + "epoch": 0.54, + "grad_norm": 1.228411889965284, + "learning_rate": 9.06039277380829e-06, + "loss": 0.5751, + "step": 7013 + }, + { + "epoch": 0.54, + "grad_norm": 1.0887813517176645, + "learning_rate": 9.057891248709508e-06, + "loss": 0.5133, + "step": 7014 + }, + { + "epoch": 0.54, + "grad_norm": 1.391514911296203, + "learning_rate": 9.055389783090916e-06, + "loss": 0.5703, + "step": 7015 + }, + { + "epoch": 0.54, + "grad_norm": 1.1722422723627295, + "learning_rate": 9.05288837711044e-06, + "loss": 0.532, + "step": 7016 + }, + { + "epoch": 0.54, + "grad_norm": 1.085709692023892, + "learning_rate": 9.050387030926008e-06, + "loss": 0.5014, + "step": 7017 + }, + { + "epoch": 0.54, + "grad_norm": 1.1962447177686633, + "learning_rate": 9.047885744695539e-06, + "loss": 0.5419, + "step": 7018 + }, + { + "epoch": 0.54, + "grad_norm": 1.1480357706597895, + "learning_rate": 9.045384518576956e-06, + "loss": 0.5024, + "step": 7019 + }, + { + "epoch": 0.54, + "grad_norm": 1.20420491686902, + "learning_rate": 9.042883352728176e-06, + "loss": 0.5586, + "step": 7020 + }, + { + "epoch": 0.54, + "grad_norm": 1.1029276944252733, + "learning_rate": 9.040382247307107e-06, + "loss": 0.5225, + "step": 7021 + }, + { + "epoch": 0.54, + "grad_norm": 1.311248409608471, + "learning_rate": 9.037881202471657e-06, + "loss": 0.6048, + "step": 7022 + }, + { + "epoch": 0.54, + "grad_norm": 1.1548144733989967, + "learning_rate": 9.03538021837973e-06, + "loss": 0.5379, + "step": 7023 + }, + { + "epoch": 0.54, + "grad_norm": 1.1492036612302654, + "learning_rate": 9.032879295189226e-06, + "loss": 0.5451, + "step": 7024 + }, + { + "epoch": 0.54, + "grad_norm": 1.2538196854702794, + "learning_rate": 9.030378433058045e-06, + "loss": 0.6014, + "step": 7025 + }, + { + "epoch": 0.55, + "grad_norm": 1.1552907212061672, + "learning_rate": 9.027877632144076e-06, + "loss": 0.5065, + "step": 7026 + }, + { + "epoch": 0.55, + "grad_norm": 1.1356965282967875, + "learning_rate": 9.025376892605205e-06, + "loss": 0.5296, + "step": 7027 + }, + { + "epoch": 0.55, + "grad_norm": 1.2273073303918798, + "learning_rate": 9.02287621459932e-06, + "loss": 0.5217, + "step": 7028 + }, + { + "epoch": 0.55, + "grad_norm": 1.1045813351649905, + "learning_rate": 9.0203755982843e-06, + "loss": 0.5463, + "step": 7029 + }, + { + "epoch": 0.55, + "grad_norm": 1.2339777729839565, + "learning_rate": 9.017875043818024e-06, + "loss": 0.5377, + "step": 7030 + }, + { + "epoch": 0.55, + "grad_norm": 1.0785493982892813, + "learning_rate": 9.015374551358364e-06, + "loss": 0.5226, + "step": 7031 + }, + { + "epoch": 0.55, + "grad_norm": 1.1916739741459232, + "learning_rate": 9.012874121063189e-06, + "loss": 0.5848, + "step": 7032 + }, + { + "epoch": 0.55, + "grad_norm": 1.204853896389094, + "learning_rate": 9.01037375309036e-06, + "loss": 0.5662, + "step": 7033 + }, + { + "epoch": 0.55, + "grad_norm": 1.2784209302874183, + "learning_rate": 9.007873447597744e-06, + "loss": 0.5707, + "step": 7034 + }, + { + "epoch": 0.55, + "grad_norm": 1.143587107080752, + "learning_rate": 9.005373204743196e-06, + "loss": 0.5527, + "step": 7035 + }, + { + "epoch": 0.55, + "grad_norm": 1.0765289987962836, + "learning_rate": 9.00287302468457e-06, + "loss": 0.4727, + "step": 7036 + }, + { + "epoch": 0.55, + "grad_norm": 1.1779686234607318, + "learning_rate": 9.000372907579716e-06, + "loss": 0.5093, + "step": 7037 + }, + { + "epoch": 0.55, + "grad_norm": 1.199891687511814, + "learning_rate": 8.997872853586474e-06, + "loss": 0.5887, + "step": 7038 + }, + { + "epoch": 0.55, + "grad_norm": 1.1354772860708937, + "learning_rate": 8.995372862862687e-06, + "loss": 0.4775, + "step": 7039 + }, + { + "epoch": 0.55, + "grad_norm": 1.1514607812892144, + "learning_rate": 8.9928729355662e-06, + "loss": 0.5055, + "step": 7040 + }, + { + "epoch": 0.55, + "grad_norm": 1.2774361393264921, + "learning_rate": 8.990373071854842e-06, + "loss": 0.6212, + "step": 7041 + }, + { + "epoch": 0.55, + "grad_norm": 1.2539147112314308, + "learning_rate": 8.987873271886436e-06, + "loss": 0.567, + "step": 7042 + }, + { + "epoch": 0.55, + "grad_norm": 1.0432130334641108, + "learning_rate": 8.985373535818814e-06, + "loss": 0.5036, + "step": 7043 + }, + { + "epoch": 0.55, + "grad_norm": 1.2721665950248542, + "learning_rate": 8.982873863809793e-06, + "loss": 0.5396, + "step": 7044 + }, + { + "epoch": 0.55, + "grad_norm": 1.1076160509604467, + "learning_rate": 8.980374256017196e-06, + "loss": 0.4962, + "step": 7045 + }, + { + "epoch": 0.55, + "grad_norm": 1.2018725824753307, + "learning_rate": 8.977874712598833e-06, + "loss": 0.5538, + "step": 7046 + }, + { + "epoch": 0.55, + "grad_norm": 1.137887360567805, + "learning_rate": 8.975375233712511e-06, + "loss": 0.5593, + "step": 7047 + }, + { + "epoch": 0.55, + "grad_norm": 1.1979539395490664, + "learning_rate": 8.972875819516037e-06, + "loss": 0.5256, + "step": 7048 + }, + { + "epoch": 0.55, + "grad_norm": 1.271288269202131, + "learning_rate": 8.97037647016721e-06, + "loss": 0.56, + "step": 7049 + }, + { + "epoch": 0.55, + "grad_norm": 1.1684079062261288, + "learning_rate": 8.967877185823833e-06, + "loss": 0.5269, + "step": 7050 + }, + { + "epoch": 0.55, + "grad_norm": 1.1487415975070363, + "learning_rate": 8.96537796664369e-06, + "loss": 0.5491, + "step": 7051 + }, + { + "epoch": 0.55, + "grad_norm": 1.0950324578025312, + "learning_rate": 8.96287881278458e-06, + "loss": 0.5636, + "step": 7052 + }, + { + "epoch": 0.55, + "grad_norm": 1.1393221377415492, + "learning_rate": 8.960379724404275e-06, + "loss": 0.5054, + "step": 7053 + }, + { + "epoch": 0.55, + "grad_norm": 1.2108883755471758, + "learning_rate": 8.957880701660563e-06, + "loss": 0.5465, + "step": 7054 + }, + { + "epoch": 0.55, + "grad_norm": 1.2598750577015054, + "learning_rate": 8.955381744711222e-06, + "loss": 0.5956, + "step": 7055 + }, + { + "epoch": 0.55, + "grad_norm": 1.2598406155762636, + "learning_rate": 8.952882853714017e-06, + "loss": 0.5896, + "step": 7056 + }, + { + "epoch": 0.55, + "grad_norm": 1.1970730948793176, + "learning_rate": 8.950384028826724e-06, + "loss": 0.56, + "step": 7057 + }, + { + "epoch": 0.55, + "grad_norm": 1.226250432643367, + "learning_rate": 8.9478852702071e-06, + "loss": 0.6067, + "step": 7058 + }, + { + "epoch": 0.55, + "grad_norm": 1.1200959553656014, + "learning_rate": 8.945386578012906e-06, + "loss": 0.532, + "step": 7059 + }, + { + "epoch": 0.55, + "grad_norm": 1.0510084622711193, + "learning_rate": 8.942887952401901e-06, + "loss": 0.4692, + "step": 7060 + }, + { + "epoch": 0.55, + "grad_norm": 1.1454068632921695, + "learning_rate": 8.94038939353183e-06, + "loss": 0.5528, + "step": 7061 + }, + { + "epoch": 0.55, + "grad_norm": 1.1907246124969835, + "learning_rate": 8.937890901560452e-06, + "loss": 0.5423, + "step": 7062 + }, + { + "epoch": 0.55, + "grad_norm": 1.2802901045051178, + "learning_rate": 8.935392476645493e-06, + "loss": 0.5969, + "step": 7063 + }, + { + "epoch": 0.55, + "grad_norm": 1.1649678952782836, + "learning_rate": 8.932894118944699e-06, + "loss": 0.5564, + "step": 7064 + }, + { + "epoch": 0.55, + "grad_norm": 1.1245136269252156, + "learning_rate": 8.930395828615808e-06, + "loss": 0.5204, + "step": 7065 + }, + { + "epoch": 0.55, + "grad_norm": 1.158987413382461, + "learning_rate": 8.927897605816546e-06, + "loss": 0.5468, + "step": 7066 + }, + { + "epoch": 0.55, + "grad_norm": 1.1992520007764307, + "learning_rate": 8.925399450704642e-06, + "loss": 0.5747, + "step": 7067 + }, + { + "epoch": 0.55, + "grad_norm": 1.0992629486286078, + "learning_rate": 8.922901363437812e-06, + "loss": 0.5121, + "step": 7068 + }, + { + "epoch": 0.55, + "grad_norm": 1.087809205218736, + "learning_rate": 8.920403344173776e-06, + "loss": 0.5131, + "step": 7069 + }, + { + "epoch": 0.55, + "grad_norm": 1.15644850186857, + "learning_rate": 8.917905393070249e-06, + "loss": 0.4964, + "step": 7070 + }, + { + "epoch": 0.55, + "grad_norm": 1.1522958519242956, + "learning_rate": 8.915407510284933e-06, + "loss": 0.4904, + "step": 7071 + }, + { + "epoch": 0.55, + "grad_norm": 1.0675906915130946, + "learning_rate": 8.912909695975543e-06, + "loss": 0.561, + "step": 7072 + }, + { + "epoch": 0.55, + "grad_norm": 1.1408533886655317, + "learning_rate": 8.910411950299769e-06, + "loss": 0.4897, + "step": 7073 + }, + { + "epoch": 0.55, + "grad_norm": 1.0958324770809256, + "learning_rate": 8.907914273415308e-06, + "loss": 0.527, + "step": 7074 + }, + { + "epoch": 0.55, + "grad_norm": 1.2821511146910982, + "learning_rate": 8.905416665479856e-06, + "loss": 0.5092, + "step": 7075 + }, + { + "epoch": 0.55, + "grad_norm": 1.2290909091858149, + "learning_rate": 8.902919126651096e-06, + "loss": 0.5701, + "step": 7076 + }, + { + "epoch": 0.55, + "grad_norm": 1.2122036129224039, + "learning_rate": 8.900421657086716e-06, + "loss": 0.5234, + "step": 7077 + }, + { + "epoch": 0.55, + "grad_norm": 1.1998593983137111, + "learning_rate": 8.897924256944386e-06, + "loss": 0.5974, + "step": 7078 + }, + { + "epoch": 0.55, + "grad_norm": 1.1215766637571554, + "learning_rate": 8.895426926381782e-06, + "loss": 0.5151, + "step": 7079 + }, + { + "epoch": 0.55, + "grad_norm": 1.086803447787699, + "learning_rate": 8.892929665556577e-06, + "loss": 0.5216, + "step": 7080 + }, + { + "epoch": 0.55, + "grad_norm": 1.0911354968811233, + "learning_rate": 8.890432474626433e-06, + "loss": 0.4995, + "step": 7081 + }, + { + "epoch": 0.55, + "grad_norm": 1.1227341722431547, + "learning_rate": 8.887935353749017e-06, + "loss": 0.5727, + "step": 7082 + }, + { + "epoch": 0.55, + "grad_norm": 1.3099574939733143, + "learning_rate": 8.885438303081972e-06, + "loss": 0.5852, + "step": 7083 + }, + { + "epoch": 0.55, + "grad_norm": 1.2274453454368328, + "learning_rate": 8.88294132278296e-06, + "loss": 0.5978, + "step": 7084 + }, + { + "epoch": 0.55, + "grad_norm": 1.1841064450608139, + "learning_rate": 8.880444413009627e-06, + "loss": 0.5602, + "step": 7085 + }, + { + "epoch": 0.55, + "grad_norm": 1.3034456933330383, + "learning_rate": 8.877947573919612e-06, + "loss": 0.5742, + "step": 7086 + }, + { + "epoch": 0.55, + "grad_norm": 1.2213081020406638, + "learning_rate": 8.87545080567056e-06, + "loss": 0.5324, + "step": 7087 + }, + { + "epoch": 0.55, + "grad_norm": 1.2171063221022014, + "learning_rate": 8.872954108420096e-06, + "loss": 0.5474, + "step": 7088 + }, + { + "epoch": 0.55, + "grad_norm": 1.0927276738106495, + "learning_rate": 8.870457482325854e-06, + "loss": 0.534, + "step": 7089 + }, + { + "epoch": 0.55, + "grad_norm": 1.1417210159916185, + "learning_rate": 8.867960927545461e-06, + "loss": 0.52, + "step": 7090 + }, + { + "epoch": 0.55, + "grad_norm": 1.1728181731007823, + "learning_rate": 8.865464444236534e-06, + "loss": 0.5447, + "step": 7091 + }, + { + "epoch": 0.55, + "grad_norm": 1.076641554898837, + "learning_rate": 8.862968032556694e-06, + "loss": 0.4897, + "step": 7092 + }, + { + "epoch": 0.55, + "grad_norm": 1.1660198166866136, + "learning_rate": 8.860471692663542e-06, + "loss": 0.5194, + "step": 7093 + }, + { + "epoch": 0.55, + "grad_norm": 1.1576782375229382, + "learning_rate": 8.857975424714694e-06, + "loss": 0.5384, + "step": 7094 + }, + { + "epoch": 0.55, + "grad_norm": 1.2249639019221212, + "learning_rate": 8.855479228867751e-06, + "loss": 0.5313, + "step": 7095 + }, + { + "epoch": 0.55, + "grad_norm": 1.208790999371075, + "learning_rate": 8.852983105280307e-06, + "loss": 0.5283, + "step": 7096 + }, + { + "epoch": 0.55, + "grad_norm": 1.0993255194653686, + "learning_rate": 8.850487054109961e-06, + "loss": 0.5268, + "step": 7097 + }, + { + "epoch": 0.55, + "grad_norm": 1.1716331232314943, + "learning_rate": 8.847991075514294e-06, + "loss": 0.5317, + "step": 7098 + }, + { + "epoch": 0.55, + "grad_norm": 1.1810950162426344, + "learning_rate": 8.845495169650894e-06, + "loss": 0.5561, + "step": 7099 + }, + { + "epoch": 0.55, + "grad_norm": 1.268011786011996, + "learning_rate": 8.842999336677342e-06, + "loss": 0.591, + "step": 7100 + }, + { + "epoch": 0.55, + "grad_norm": 1.239070171702857, + "learning_rate": 8.84050357675121e-06, + "loss": 0.5664, + "step": 7101 + }, + { + "epoch": 0.55, + "grad_norm": 1.2000389152415756, + "learning_rate": 8.838007890030074e-06, + "loss": 0.5522, + "step": 7102 + }, + { + "epoch": 0.55, + "grad_norm": 1.1787176124111955, + "learning_rate": 8.83551227667149e-06, + "loss": 0.5402, + "step": 7103 + }, + { + "epoch": 0.55, + "grad_norm": 1.2485091856044537, + "learning_rate": 8.833016736833023e-06, + "loss": 0.5624, + "step": 7104 + }, + { + "epoch": 0.55, + "grad_norm": 1.257974507578442, + "learning_rate": 8.830521270672233e-06, + "loss": 0.5433, + "step": 7105 + }, + { + "epoch": 0.55, + "grad_norm": 1.1308969020381792, + "learning_rate": 8.828025878346667e-06, + "loss": 0.5292, + "step": 7106 + }, + { + "epoch": 0.55, + "grad_norm": 1.1160898218001667, + "learning_rate": 8.825530560013876e-06, + "loss": 0.4983, + "step": 7107 + }, + { + "epoch": 0.55, + "grad_norm": 1.1349720857355323, + "learning_rate": 8.8230353158314e-06, + "loss": 0.578, + "step": 7108 + }, + { + "epoch": 0.55, + "grad_norm": 1.091982475978547, + "learning_rate": 8.820540145956776e-06, + "loss": 0.465, + "step": 7109 + }, + { + "epoch": 0.55, + "grad_norm": 1.2091692904797267, + "learning_rate": 8.818045050547539e-06, + "loss": 0.5271, + "step": 7110 + }, + { + "epoch": 0.55, + "grad_norm": 1.1360981613784953, + "learning_rate": 8.815550029761215e-06, + "loss": 0.5331, + "step": 7111 + }, + { + "epoch": 0.55, + "grad_norm": 1.138158090142865, + "learning_rate": 8.813055083755327e-06, + "loss": 0.5017, + "step": 7112 + }, + { + "epoch": 0.55, + "grad_norm": 1.1919370875717665, + "learning_rate": 8.8105602126874e-06, + "loss": 0.5528, + "step": 7113 + }, + { + "epoch": 0.55, + "grad_norm": 1.064462924177229, + "learning_rate": 8.808065416714938e-06, + "loss": 0.4861, + "step": 7114 + }, + { + "epoch": 0.55, + "grad_norm": 1.1121106334340165, + "learning_rate": 8.805570695995462e-06, + "loss": 0.4776, + "step": 7115 + }, + { + "epoch": 0.55, + "grad_norm": 1.1453971841893755, + "learning_rate": 8.803076050686465e-06, + "loss": 0.542, + "step": 7116 + }, + { + "epoch": 0.55, + "grad_norm": 1.1840272116014923, + "learning_rate": 8.800581480945453e-06, + "loss": 0.5305, + "step": 7117 + }, + { + "epoch": 0.55, + "grad_norm": 1.3058131952361969, + "learning_rate": 8.798086986929923e-06, + "loss": 0.5811, + "step": 7118 + }, + { + "epoch": 0.55, + "grad_norm": 1.1610344539527588, + "learning_rate": 8.795592568797362e-06, + "loss": 0.5349, + "step": 7119 + }, + { + "epoch": 0.55, + "grad_norm": 1.1838873571444342, + "learning_rate": 8.793098226705255e-06, + "loss": 0.5324, + "step": 7120 + }, + { + "epoch": 0.55, + "grad_norm": 1.2025679252299515, + "learning_rate": 8.790603960811082e-06, + "loss": 0.5428, + "step": 7121 + }, + { + "epoch": 0.55, + "grad_norm": 1.1145408063719777, + "learning_rate": 8.78810977127232e-06, + "loss": 0.5201, + "step": 7122 + }, + { + "epoch": 0.55, + "grad_norm": 1.1309701078024728, + "learning_rate": 8.785615658246448e-06, + "loss": 0.5456, + "step": 7123 + }, + { + "epoch": 0.55, + "grad_norm": 1.1486368330710166, + "learning_rate": 8.783121621890917e-06, + "loss": 0.5575, + "step": 7124 + }, + { + "epoch": 0.55, + "grad_norm": 1.256711584666228, + "learning_rate": 8.780627662363201e-06, + "loss": 0.58, + "step": 7125 + }, + { + "epoch": 0.55, + "grad_norm": 1.1477685134647573, + "learning_rate": 8.77813377982075e-06, + "loss": 0.5745, + "step": 7126 + }, + { + "epoch": 0.55, + "grad_norm": 1.1382409355025513, + "learning_rate": 8.775639974421015e-06, + "loss": 0.596, + "step": 7127 + }, + { + "epoch": 0.55, + "grad_norm": 1.1452729617758346, + "learning_rate": 8.773146246321452e-06, + "loss": 0.5872, + "step": 7128 + }, + { + "epoch": 0.55, + "grad_norm": 1.1428474304518808, + "learning_rate": 8.77065259567949e-06, + "loss": 0.547, + "step": 7129 + }, + { + "epoch": 0.55, + "grad_norm": 1.28632688687355, + "learning_rate": 8.768159022652574e-06, + "loss": 0.6348, + "step": 7130 + }, + { + "epoch": 0.55, + "grad_norm": 1.2055499422193672, + "learning_rate": 8.765665527398133e-06, + "loss": 0.5535, + "step": 7131 + }, + { + "epoch": 0.55, + "grad_norm": 1.0953780455302633, + "learning_rate": 8.763172110073596e-06, + "loss": 0.5218, + "step": 7132 + }, + { + "epoch": 0.55, + "grad_norm": 1.0788903215186887, + "learning_rate": 8.760678770836391e-06, + "loss": 0.519, + "step": 7133 + }, + { + "epoch": 0.55, + "grad_norm": 1.1224003000310658, + "learning_rate": 8.758185509843921e-06, + "loss": 0.4871, + "step": 7134 + }, + { + "epoch": 0.55, + "grad_norm": 1.0695437155443668, + "learning_rate": 8.755692327253612e-06, + "loss": 0.5097, + "step": 7135 + }, + { + "epoch": 0.55, + "grad_norm": 1.1795728008770674, + "learning_rate": 8.753199223222863e-06, + "loss": 0.5573, + "step": 7136 + }, + { + "epoch": 0.55, + "grad_norm": 1.2191648510814888, + "learning_rate": 8.75070619790908e-06, + "loss": 0.5931, + "step": 7137 + }, + { + "epoch": 0.55, + "grad_norm": 1.1634564454172729, + "learning_rate": 8.748213251469663e-06, + "loss": 0.5397, + "step": 7138 + }, + { + "epoch": 0.55, + "grad_norm": 1.1976217266165836, + "learning_rate": 8.745720384062002e-06, + "loss": 0.5591, + "step": 7139 + }, + { + "epoch": 0.55, + "grad_norm": 1.046246183199284, + "learning_rate": 8.74322759584348e-06, + "loss": 0.4605, + "step": 7140 + }, + { + "epoch": 0.55, + "grad_norm": 1.1382486855817477, + "learning_rate": 8.740734886971485e-06, + "loss": 0.528, + "step": 7141 + }, + { + "epoch": 0.55, + "grad_norm": 1.1312567189053966, + "learning_rate": 8.738242257603394e-06, + "loss": 0.5534, + "step": 7142 + }, + { + "epoch": 0.55, + "grad_norm": 1.0714343797435453, + "learning_rate": 8.735749707896584e-06, + "loss": 0.4893, + "step": 7143 + }, + { + "epoch": 0.55, + "grad_norm": 1.27354393262686, + "learning_rate": 8.733257238008414e-06, + "loss": 0.6157, + "step": 7144 + }, + { + "epoch": 0.55, + "grad_norm": 1.151990519550838, + "learning_rate": 8.730764848096247e-06, + "loss": 0.5211, + "step": 7145 + }, + { + "epoch": 0.55, + "grad_norm": 1.1484824450774165, + "learning_rate": 8.728272538317447e-06, + "loss": 0.5343, + "step": 7146 + }, + { + "epoch": 0.55, + "grad_norm": 1.2753813248159431, + "learning_rate": 8.725780308829358e-06, + "loss": 0.5678, + "step": 7147 + }, + { + "epoch": 0.55, + "grad_norm": 1.0761916475504654, + "learning_rate": 8.72328815978934e-06, + "loss": 0.5449, + "step": 7148 + }, + { + "epoch": 0.55, + "grad_norm": 1.2135464036491619, + "learning_rate": 8.720796091354725e-06, + "loss": 0.536, + "step": 7149 + }, + { + "epoch": 0.55, + "grad_norm": 1.290066830656032, + "learning_rate": 8.71830410368285e-06, + "loss": 0.5861, + "step": 7150 + }, + { + "epoch": 0.55, + "grad_norm": 1.1461821747559466, + "learning_rate": 8.715812196931049e-06, + "loss": 0.5073, + "step": 7151 + }, + { + "epoch": 0.55, + "grad_norm": 1.1372245371197143, + "learning_rate": 8.713320371256651e-06, + "loss": 0.5703, + "step": 7152 + }, + { + "epoch": 0.55, + "grad_norm": 1.227331661405224, + "learning_rate": 8.71082862681698e-06, + "loss": 0.5832, + "step": 7153 + }, + { + "epoch": 0.55, + "grad_norm": 1.2050269810042098, + "learning_rate": 8.708336963769345e-06, + "loss": 0.5622, + "step": 7154 + }, + { + "epoch": 0.56, + "grad_norm": 1.1607089283578502, + "learning_rate": 8.705845382271063e-06, + "loss": 0.5249, + "step": 7155 + }, + { + "epoch": 0.56, + "grad_norm": 1.2006896500902056, + "learning_rate": 8.703353882479436e-06, + "loss": 0.6039, + "step": 7156 + }, + { + "epoch": 0.56, + "grad_norm": 1.1997359958951372, + "learning_rate": 8.70086246455177e-06, + "loss": 0.523, + "step": 7157 + }, + { + "epoch": 0.56, + "grad_norm": 1.149052876755601, + "learning_rate": 8.698371128645364e-06, + "loss": 0.57, + "step": 7158 + }, + { + "epoch": 0.56, + "grad_norm": 1.2437832257610775, + "learning_rate": 8.6958798749175e-06, + "loss": 0.5645, + "step": 7159 + }, + { + "epoch": 0.56, + "grad_norm": 1.2376319314007216, + "learning_rate": 8.693388703525467e-06, + "loss": 0.5769, + "step": 7160 + }, + { + "epoch": 0.56, + "grad_norm": 1.1158212163284837, + "learning_rate": 8.690897614626546e-06, + "loss": 0.5515, + "step": 7161 + }, + { + "epoch": 0.56, + "grad_norm": 1.3077068454810339, + "learning_rate": 8.688406608378012e-06, + "loss": 0.5694, + "step": 7162 + }, + { + "epoch": 0.56, + "grad_norm": 1.1859861562445195, + "learning_rate": 8.685915684937138e-06, + "loss": 0.5276, + "step": 7163 + }, + { + "epoch": 0.56, + "grad_norm": 1.1966382322855935, + "learning_rate": 8.68342484446119e-06, + "loss": 0.5182, + "step": 7164 + }, + { + "epoch": 0.56, + "grad_norm": 1.1723239293101348, + "learning_rate": 8.68093408710742e-06, + "loss": 0.5786, + "step": 7165 + }, + { + "epoch": 0.56, + "grad_norm": 1.2801214225368167, + "learning_rate": 8.678443413033085e-06, + "loss": 0.5472, + "step": 7166 + }, + { + "epoch": 0.56, + "grad_norm": 1.1055663911542983, + "learning_rate": 8.675952822395437e-06, + "loss": 0.4865, + "step": 7167 + }, + { + "epoch": 0.56, + "grad_norm": 1.2392195266569208, + "learning_rate": 8.67346231535172e-06, + "loss": 0.5382, + "step": 7168 + }, + { + "epoch": 0.56, + "grad_norm": 1.1553552103647624, + "learning_rate": 8.670971892059173e-06, + "loss": 0.5446, + "step": 7169 + }, + { + "epoch": 0.56, + "grad_norm": 1.0555646286680438, + "learning_rate": 8.668481552675024e-06, + "loss": 0.4894, + "step": 7170 + }, + { + "epoch": 0.56, + "grad_norm": 1.169164906258231, + "learning_rate": 8.665991297356503e-06, + "loss": 0.5314, + "step": 7171 + }, + { + "epoch": 0.56, + "grad_norm": 1.2251974394366452, + "learning_rate": 8.663501126260836e-06, + "loss": 0.5657, + "step": 7172 + }, + { + "epoch": 0.56, + "grad_norm": 1.0571001502656456, + "learning_rate": 8.661011039545238e-06, + "loss": 0.5271, + "step": 7173 + }, + { + "epoch": 0.56, + "grad_norm": 1.1893826668953762, + "learning_rate": 8.658521037366926e-06, + "loss": 0.5124, + "step": 7174 + }, + { + "epoch": 0.56, + "grad_norm": 1.0664217322486766, + "learning_rate": 8.656031119883095e-06, + "loss": 0.5068, + "step": 7175 + }, + { + "epoch": 0.56, + "grad_norm": 1.2050477059604079, + "learning_rate": 8.653541287250954e-06, + "loss": 0.5621, + "step": 7176 + }, + { + "epoch": 0.56, + "grad_norm": 1.0894080490903262, + "learning_rate": 8.6510515396277e-06, + "loss": 0.4987, + "step": 7177 + }, + { + "epoch": 0.56, + "grad_norm": 1.0904926296231676, + "learning_rate": 8.648561877170522e-06, + "loss": 0.4856, + "step": 7178 + }, + { + "epoch": 0.56, + "grad_norm": 1.0408307281677367, + "learning_rate": 8.64607230003661e-06, + "loss": 0.4872, + "step": 7179 + }, + { + "epoch": 0.56, + "grad_norm": 1.1827934757451357, + "learning_rate": 8.643582808383133e-06, + "loss": 0.5246, + "step": 7180 + }, + { + "epoch": 0.56, + "grad_norm": 1.3385979112418176, + "learning_rate": 8.641093402367272e-06, + "loss": 0.55, + "step": 7181 + }, + { + "epoch": 0.56, + "grad_norm": 1.1946439681128436, + "learning_rate": 8.638604082146195e-06, + "loss": 0.6115, + "step": 7182 + }, + { + "epoch": 0.56, + "grad_norm": 1.2237109915075493, + "learning_rate": 8.636114847877068e-06, + "loss": 0.5065, + "step": 7183 + }, + { + "epoch": 0.56, + "grad_norm": 1.3110380204664942, + "learning_rate": 8.633625699717051e-06, + "loss": 0.5248, + "step": 7184 + }, + { + "epoch": 0.56, + "grad_norm": 1.0426367756428179, + "learning_rate": 8.631136637823288e-06, + "loss": 0.5071, + "step": 7185 + }, + { + "epoch": 0.56, + "grad_norm": 1.1508867783857837, + "learning_rate": 8.628647662352932e-06, + "loss": 0.5257, + "step": 7186 + }, + { + "epoch": 0.56, + "grad_norm": 1.1011325186374112, + "learning_rate": 8.626158773463124e-06, + "loss": 0.539, + "step": 7187 + }, + { + "epoch": 0.56, + "grad_norm": 1.1338877869041564, + "learning_rate": 8.623669971311002e-06, + "loss": 0.4981, + "step": 7188 + }, + { + "epoch": 0.56, + "grad_norm": 1.2576198548823194, + "learning_rate": 8.621181256053699e-06, + "loss": 0.5641, + "step": 7189 + }, + { + "epoch": 0.56, + "grad_norm": 1.163963057603609, + "learning_rate": 8.618692627848331e-06, + "loss": 0.5258, + "step": 7190 + }, + { + "epoch": 0.56, + "grad_norm": 1.2466958722775159, + "learning_rate": 8.616204086852026e-06, + "loss": 0.5606, + "step": 7191 + }, + { + "epoch": 0.56, + "grad_norm": 1.2311160368099745, + "learning_rate": 8.613715633221895e-06, + "loss": 0.5969, + "step": 7192 + }, + { + "epoch": 0.56, + "grad_norm": 1.1809099949351327, + "learning_rate": 8.611227267115052e-06, + "loss": 0.5437, + "step": 7193 + }, + { + "epoch": 0.56, + "grad_norm": 1.2797272169496263, + "learning_rate": 8.608738988688598e-06, + "loss": 0.576, + "step": 7194 + }, + { + "epoch": 0.56, + "grad_norm": 1.1589875162388705, + "learning_rate": 8.606250798099626e-06, + "loss": 0.5715, + "step": 7195 + }, + { + "epoch": 0.56, + "grad_norm": 1.2012791531426092, + "learning_rate": 8.603762695505231e-06, + "loss": 0.5358, + "step": 7196 + }, + { + "epoch": 0.56, + "grad_norm": 1.1834082631220921, + "learning_rate": 8.601274681062502e-06, + "loss": 0.5103, + "step": 7197 + }, + { + "epoch": 0.56, + "grad_norm": 1.204446092060373, + "learning_rate": 8.598786754928519e-06, + "loss": 0.4844, + "step": 7198 + }, + { + "epoch": 0.56, + "grad_norm": 1.1348855354130147, + "learning_rate": 8.596298917260361e-06, + "loss": 0.5131, + "step": 7199 + }, + { + "epoch": 0.56, + "grad_norm": 1.2509721313683964, + "learning_rate": 8.59381116821509e-06, + "loss": 0.5928, + "step": 7200 + }, + { + "epoch": 0.56, + "grad_norm": 1.2113007523618666, + "learning_rate": 8.591323507949773e-06, + "loss": 0.5475, + "step": 7201 + }, + { + "epoch": 0.56, + "grad_norm": 1.113959764877169, + "learning_rate": 8.588835936621473e-06, + "loss": 0.5209, + "step": 7202 + }, + { + "epoch": 0.56, + "grad_norm": 1.149132861902752, + "learning_rate": 8.586348454387244e-06, + "loss": 0.5237, + "step": 7203 + }, + { + "epoch": 0.56, + "grad_norm": 1.1629394131473179, + "learning_rate": 8.583861061404131e-06, + "loss": 0.552, + "step": 7204 + }, + { + "epoch": 0.56, + "grad_norm": 1.0557785042725218, + "learning_rate": 8.581373757829172e-06, + "loss": 0.5328, + "step": 7205 + }, + { + "epoch": 0.56, + "grad_norm": 1.2137346509911722, + "learning_rate": 8.578886543819406e-06, + "loss": 0.5114, + "step": 7206 + }, + { + "epoch": 0.56, + "grad_norm": 1.0513011114370965, + "learning_rate": 8.576399419531865e-06, + "loss": 0.4742, + "step": 7207 + }, + { + "epoch": 0.56, + "grad_norm": 1.2870882579593583, + "learning_rate": 8.573912385123576e-06, + "loss": 0.5477, + "step": 7208 + }, + { + "epoch": 0.56, + "grad_norm": 1.2033791707062595, + "learning_rate": 8.571425440751557e-06, + "loss": 0.5532, + "step": 7209 + }, + { + "epoch": 0.56, + "grad_norm": 1.1112153275941123, + "learning_rate": 8.568938586572816e-06, + "loss": 0.544, + "step": 7210 + }, + { + "epoch": 0.56, + "grad_norm": 1.269166113355937, + "learning_rate": 8.566451822744367e-06, + "loss": 0.5632, + "step": 7211 + }, + { + "epoch": 0.56, + "grad_norm": 1.103377341999504, + "learning_rate": 8.563965149423207e-06, + "loss": 0.5588, + "step": 7212 + }, + { + "epoch": 0.56, + "grad_norm": 1.1829084168500636, + "learning_rate": 8.561478566766341e-06, + "loss": 0.5701, + "step": 7213 + }, + { + "epoch": 0.56, + "grad_norm": 1.1282394182083846, + "learning_rate": 8.558992074930757e-06, + "loss": 0.555, + "step": 7214 + }, + { + "epoch": 0.56, + "grad_norm": 1.1897374206442264, + "learning_rate": 8.556505674073435e-06, + "loss": 0.5654, + "step": 7215 + }, + { + "epoch": 0.56, + "grad_norm": 1.1373050396863185, + "learning_rate": 8.554019364351354e-06, + "loss": 0.5329, + "step": 7216 + }, + { + "epoch": 0.56, + "grad_norm": 1.2656010284037458, + "learning_rate": 8.551533145921493e-06, + "loss": 0.5823, + "step": 7217 + }, + { + "epoch": 0.56, + "grad_norm": 1.0771660894087123, + "learning_rate": 8.54904701894082e-06, + "loss": 0.5265, + "step": 7218 + }, + { + "epoch": 0.56, + "grad_norm": 1.1263101154842148, + "learning_rate": 8.54656098356629e-06, + "loss": 0.5315, + "step": 7219 + }, + { + "epoch": 0.56, + "grad_norm": 1.190866566980221, + "learning_rate": 8.54407503995487e-06, + "loss": 0.6044, + "step": 7220 + }, + { + "epoch": 0.56, + "grad_norm": 1.1645208390903092, + "learning_rate": 8.541589188263499e-06, + "loss": 0.5599, + "step": 7221 + }, + { + "epoch": 0.56, + "grad_norm": 1.3026463880665522, + "learning_rate": 8.53910342864913e-06, + "loss": 0.5796, + "step": 7222 + }, + { + "epoch": 0.56, + "grad_norm": 1.1717749998023268, + "learning_rate": 8.536617761268697e-06, + "loss": 0.5296, + "step": 7223 + }, + { + "epoch": 0.56, + "grad_norm": 1.1187839438973723, + "learning_rate": 8.534132186279134e-06, + "loss": 0.4973, + "step": 7224 + }, + { + "epoch": 0.56, + "grad_norm": 1.1452520919211118, + "learning_rate": 8.531646703837375e-06, + "loss": 0.515, + "step": 7225 + }, + { + "epoch": 0.56, + "grad_norm": 1.149693120562664, + "learning_rate": 8.52916131410033e-06, + "loss": 0.5897, + "step": 7226 + }, + { + "epoch": 0.56, + "grad_norm": 1.2401958790008578, + "learning_rate": 8.526676017224917e-06, + "loss": 0.4929, + "step": 7227 + }, + { + "epoch": 0.56, + "grad_norm": 1.0882690976704232, + "learning_rate": 8.524190813368055e-06, + "loss": 0.5132, + "step": 7228 + }, + { + "epoch": 0.56, + "grad_norm": 1.1196918295186495, + "learning_rate": 8.521705702686636e-06, + "loss": 0.5099, + "step": 7229 + }, + { + "epoch": 0.56, + "grad_norm": 1.0947322794265588, + "learning_rate": 8.519220685337567e-06, + "loss": 0.4871, + "step": 7230 + }, + { + "epoch": 0.56, + "grad_norm": 1.1736105718868504, + "learning_rate": 8.516735761477734e-06, + "loss": 0.5403, + "step": 7231 + }, + { + "epoch": 0.56, + "grad_norm": 1.1571712432386958, + "learning_rate": 8.514250931264023e-06, + "loss": 0.5147, + "step": 7232 + }, + { + "epoch": 0.56, + "grad_norm": 1.207567213253128, + "learning_rate": 8.51176619485332e-06, + "loss": 0.5414, + "step": 7233 + }, + { + "epoch": 0.56, + "grad_norm": 1.1109181806169475, + "learning_rate": 8.509281552402492e-06, + "loss": 0.4779, + "step": 7234 + }, + { + "epoch": 0.56, + "grad_norm": 1.1273065868869139, + "learning_rate": 8.506797004068414e-06, + "loss": 0.5445, + "step": 7235 + }, + { + "epoch": 0.56, + "grad_norm": 1.1870942175381705, + "learning_rate": 8.504312550007943e-06, + "loss": 0.5392, + "step": 7236 + }, + { + "epoch": 0.56, + "grad_norm": 1.144063964859484, + "learning_rate": 8.501828190377936e-06, + "loss": 0.5915, + "step": 7237 + }, + { + "epoch": 0.56, + "grad_norm": 1.2614229877202074, + "learning_rate": 8.499343925335249e-06, + "loss": 0.5357, + "step": 7238 + }, + { + "epoch": 0.56, + "grad_norm": 1.0709019729246994, + "learning_rate": 8.496859755036719e-06, + "loss": 0.5138, + "step": 7239 + }, + { + "epoch": 0.56, + "grad_norm": 1.2774763126006197, + "learning_rate": 8.49437567963919e-06, + "loss": 0.5745, + "step": 7240 + }, + { + "epoch": 0.56, + "grad_norm": 1.0626795560919822, + "learning_rate": 8.491891699299491e-06, + "loss": 0.5294, + "step": 7241 + }, + { + "epoch": 0.56, + "grad_norm": 1.0893981460152122, + "learning_rate": 8.48940781417445e-06, + "loss": 0.5322, + "step": 7242 + }, + { + "epoch": 0.56, + "grad_norm": 1.17873657502087, + "learning_rate": 8.486924024420887e-06, + "loss": 0.5115, + "step": 7243 + }, + { + "epoch": 0.56, + "grad_norm": 1.1926663098435129, + "learning_rate": 8.484440330195615e-06, + "loss": 0.528, + "step": 7244 + }, + { + "epoch": 0.56, + "grad_norm": 1.1950658406590542, + "learning_rate": 8.481956731655451e-06, + "loss": 0.4774, + "step": 7245 + }, + { + "epoch": 0.56, + "grad_norm": 1.1552145677724501, + "learning_rate": 8.479473228957185e-06, + "loss": 0.5308, + "step": 7246 + }, + { + "epoch": 0.56, + "grad_norm": 1.1907529947362159, + "learning_rate": 8.47698982225762e-06, + "loss": 0.5542, + "step": 7247 + }, + { + "epoch": 0.56, + "grad_norm": 1.1525258065931892, + "learning_rate": 8.474506511713543e-06, + "loss": 0.5582, + "step": 7248 + }, + { + "epoch": 0.56, + "grad_norm": 1.2058761155597932, + "learning_rate": 8.472023297481741e-06, + "loss": 0.5131, + "step": 7249 + }, + { + "epoch": 0.56, + "grad_norm": 1.1127829524045973, + "learning_rate": 8.469540179718997e-06, + "loss": 0.5099, + "step": 7250 + }, + { + "epoch": 0.56, + "grad_norm": 1.2010304933785343, + "learning_rate": 8.467057158582072e-06, + "loss": 0.5843, + "step": 7251 + }, + { + "epoch": 0.56, + "grad_norm": 1.3161147612735844, + "learning_rate": 8.46457423422774e-06, + "loss": 0.5905, + "step": 7252 + }, + { + "epoch": 0.56, + "grad_norm": 1.128347872542259, + "learning_rate": 8.462091406812759e-06, + "loss": 0.5235, + "step": 7253 + }, + { + "epoch": 0.56, + "grad_norm": 1.0829132561644017, + "learning_rate": 8.459608676493878e-06, + "loss": 0.4717, + "step": 7254 + }, + { + "epoch": 0.56, + "grad_norm": 1.1453541476183215, + "learning_rate": 8.457126043427855e-06, + "loss": 0.5432, + "step": 7255 + }, + { + "epoch": 0.56, + "grad_norm": 1.2237496164260901, + "learning_rate": 8.45464350777142e-06, + "loss": 0.5754, + "step": 7256 + }, + { + "epoch": 0.56, + "grad_norm": 1.1572886776248892, + "learning_rate": 8.452161069681315e-06, + "loss": 0.5131, + "step": 7257 + }, + { + "epoch": 0.56, + "grad_norm": 1.2030709985280958, + "learning_rate": 8.449678729314266e-06, + "loss": 0.5472, + "step": 7258 + }, + { + "epoch": 0.56, + "grad_norm": 1.1487454371315013, + "learning_rate": 8.447196486826996e-06, + "loss": 0.533, + "step": 7259 + }, + { + "epoch": 0.56, + "grad_norm": 1.0968306073283058, + "learning_rate": 8.44471434237623e-06, + "loss": 0.4957, + "step": 7260 + }, + { + "epoch": 0.56, + "grad_norm": 1.175836099251446, + "learning_rate": 8.442232296118667e-06, + "loss": 0.5089, + "step": 7261 + }, + { + "epoch": 0.56, + "grad_norm": 1.1631159678441056, + "learning_rate": 8.439750348211016e-06, + "loss": 0.5248, + "step": 7262 + }, + { + "epoch": 0.56, + "grad_norm": 1.183753377391988, + "learning_rate": 8.437268498809975e-06, + "loss": 0.5329, + "step": 7263 + }, + { + "epoch": 0.56, + "grad_norm": 1.2399476688632751, + "learning_rate": 8.434786748072237e-06, + "loss": 0.6115, + "step": 7264 + }, + { + "epoch": 0.56, + "grad_norm": 1.0345711836073948, + "learning_rate": 8.43230509615449e-06, + "loss": 0.5, + "step": 7265 + }, + { + "epoch": 0.56, + "grad_norm": 1.2119532601090794, + "learning_rate": 8.429823543213406e-06, + "loss": 0.5383, + "step": 7266 + }, + { + "epoch": 0.56, + "grad_norm": 1.0496316172745663, + "learning_rate": 8.427342089405667e-06, + "loss": 0.4923, + "step": 7267 + }, + { + "epoch": 0.56, + "grad_norm": 1.192263985476433, + "learning_rate": 8.424860734887932e-06, + "loss": 0.5526, + "step": 7268 + }, + { + "epoch": 0.56, + "grad_norm": 1.2567423657469419, + "learning_rate": 8.422379479816865e-06, + "loss": 0.5885, + "step": 7269 + }, + { + "epoch": 0.56, + "grad_norm": 1.2610462865025873, + "learning_rate": 8.419898324349122e-06, + "loss": 0.492, + "step": 7270 + }, + { + "epoch": 0.56, + "grad_norm": 1.1361697729511353, + "learning_rate": 8.417417268641354e-06, + "loss": 0.5552, + "step": 7271 + }, + { + "epoch": 0.56, + "grad_norm": 1.1669624101359166, + "learning_rate": 8.414936312850196e-06, + "loss": 0.5574, + "step": 7272 + }, + { + "epoch": 0.56, + "grad_norm": 1.2420983432467272, + "learning_rate": 8.412455457132285e-06, + "loss": 0.5897, + "step": 7273 + }, + { + "epoch": 0.56, + "grad_norm": 1.2690424990680642, + "learning_rate": 8.409974701644251e-06, + "loss": 0.5629, + "step": 7274 + }, + { + "epoch": 0.56, + "grad_norm": 1.097771146469405, + "learning_rate": 8.40749404654272e-06, + "loss": 0.5428, + "step": 7275 + }, + { + "epoch": 0.56, + "grad_norm": 1.1377758865311753, + "learning_rate": 8.405013491984307e-06, + "loss": 0.5241, + "step": 7276 + }, + { + "epoch": 0.56, + "grad_norm": 1.2046186909895835, + "learning_rate": 8.40253303812562e-06, + "loss": 0.5155, + "step": 7277 + }, + { + "epoch": 0.56, + "grad_norm": 1.18020295256656, + "learning_rate": 8.400052685123263e-06, + "loss": 0.5699, + "step": 7278 + }, + { + "epoch": 0.56, + "grad_norm": 1.1504131632282575, + "learning_rate": 8.397572433133836e-06, + "loss": 0.5481, + "step": 7279 + }, + { + "epoch": 0.56, + "grad_norm": 1.2071228980174176, + "learning_rate": 8.395092282313927e-06, + "loss": 0.5788, + "step": 7280 + }, + { + "epoch": 0.56, + "grad_norm": 1.0344017878172516, + "learning_rate": 8.392612232820125e-06, + "loss": 0.4855, + "step": 7281 + }, + { + "epoch": 0.56, + "grad_norm": 1.1339002976899328, + "learning_rate": 8.390132284809005e-06, + "loss": 0.538, + "step": 7282 + }, + { + "epoch": 0.57, + "grad_norm": 1.1394780807194689, + "learning_rate": 8.387652438437138e-06, + "loss": 0.536, + "step": 7283 + }, + { + "epoch": 0.57, + "grad_norm": 1.139242196266246, + "learning_rate": 8.385172693861092e-06, + "loss": 0.5599, + "step": 7284 + }, + { + "epoch": 0.57, + "grad_norm": 1.1531086080569763, + "learning_rate": 8.382693051237424e-06, + "loss": 0.5404, + "step": 7285 + }, + { + "epoch": 0.57, + "grad_norm": 1.1701272774612959, + "learning_rate": 8.380213510722692e-06, + "loss": 0.5402, + "step": 7286 + }, + { + "epoch": 0.57, + "grad_norm": 1.3134095809477861, + "learning_rate": 8.377734072473437e-06, + "loss": 0.6476, + "step": 7287 + }, + { + "epoch": 0.57, + "grad_norm": 1.1960375213861296, + "learning_rate": 8.375254736646197e-06, + "loss": 0.5625, + "step": 7288 + }, + { + "epoch": 0.57, + "grad_norm": 1.157223678134591, + "learning_rate": 8.372775503397507e-06, + "loss": 0.5332, + "step": 7289 + }, + { + "epoch": 0.57, + "grad_norm": 1.0540106402842708, + "learning_rate": 8.370296372883898e-06, + "loss": 0.5107, + "step": 7290 + }, + { + "epoch": 0.57, + "grad_norm": 1.1678402424927279, + "learning_rate": 8.367817345261888e-06, + "loss": 0.5454, + "step": 7291 + }, + { + "epoch": 0.57, + "grad_norm": 1.2041928890149463, + "learning_rate": 8.36533842068799e-06, + "loss": 0.5472, + "step": 7292 + }, + { + "epoch": 0.57, + "grad_norm": 1.1474760533412618, + "learning_rate": 8.362859599318708e-06, + "loss": 0.5468, + "step": 7293 + }, + { + "epoch": 0.57, + "grad_norm": 1.2105314496889517, + "learning_rate": 8.360380881310545e-06, + "loss": 0.5101, + "step": 7294 + }, + { + "epoch": 0.57, + "grad_norm": 1.192729827598338, + "learning_rate": 8.357902266819999e-06, + "loss": 0.5766, + "step": 7295 + }, + { + "epoch": 0.57, + "grad_norm": 1.3202619373639282, + "learning_rate": 8.355423756003557e-06, + "loss": 0.5666, + "step": 7296 + }, + { + "epoch": 0.57, + "grad_norm": 1.123303777087027, + "learning_rate": 8.352945349017699e-06, + "loss": 0.5174, + "step": 7297 + }, + { + "epoch": 0.57, + "grad_norm": 1.0945743179288843, + "learning_rate": 8.350467046018892e-06, + "loss": 0.5087, + "step": 7298 + }, + { + "epoch": 0.57, + "grad_norm": 1.1474797933124947, + "learning_rate": 8.347988847163615e-06, + "loss": 0.5414, + "step": 7299 + }, + { + "epoch": 0.57, + "grad_norm": 1.0913709652548347, + "learning_rate": 8.345510752608324e-06, + "loss": 0.503, + "step": 7300 + }, + { + "epoch": 0.57, + "grad_norm": 1.1080336722549131, + "learning_rate": 8.343032762509478e-06, + "loss": 0.5209, + "step": 7301 + }, + { + "epoch": 0.57, + "grad_norm": 1.13434097827514, + "learning_rate": 8.340554877023523e-06, + "loss": 0.5619, + "step": 7302 + }, + { + "epoch": 0.57, + "grad_norm": 1.0580863722066831, + "learning_rate": 8.338077096306895e-06, + "loss": 0.4432, + "step": 7303 + }, + { + "epoch": 0.57, + "grad_norm": 1.112115242677624, + "learning_rate": 8.335599420516036e-06, + "loss": 0.5087, + "step": 7304 + }, + { + "epoch": 0.57, + "grad_norm": 1.1741007711972185, + "learning_rate": 8.333121849807374e-06, + "loss": 0.5509, + "step": 7305 + }, + { + "epoch": 0.57, + "grad_norm": 1.229404582689245, + "learning_rate": 8.330644384337334e-06, + "loss": 0.5892, + "step": 7306 + }, + { + "epoch": 0.57, + "grad_norm": 1.163419353826195, + "learning_rate": 8.328167024262323e-06, + "loss": 0.5503, + "step": 7307 + }, + { + "epoch": 0.57, + "grad_norm": 1.2644014448718197, + "learning_rate": 8.32568976973875e-06, + "loss": 0.5323, + "step": 7308 + }, + { + "epoch": 0.57, + "grad_norm": 1.204033595237437, + "learning_rate": 8.323212620923023e-06, + "loss": 0.5449, + "step": 7309 + }, + { + "epoch": 0.57, + "grad_norm": 1.1461771824794256, + "learning_rate": 8.320735577971533e-06, + "loss": 0.5388, + "step": 7310 + }, + { + "epoch": 0.57, + "grad_norm": 1.2193195772700351, + "learning_rate": 8.318258641040674e-06, + "loss": 0.5264, + "step": 7311 + }, + { + "epoch": 0.57, + "grad_norm": 1.2366653633282179, + "learning_rate": 8.31578181028682e-06, + "loss": 0.5255, + "step": 7312 + }, + { + "epoch": 0.57, + "grad_norm": 1.0600838396816714, + "learning_rate": 8.31330508586635e-06, + "loss": 0.5186, + "step": 7313 + }, + { + "epoch": 0.57, + "grad_norm": 1.3015407143700979, + "learning_rate": 8.31082846793563e-06, + "loss": 0.6232, + "step": 7314 + }, + { + "epoch": 0.57, + "grad_norm": 1.07671999948839, + "learning_rate": 8.308351956651026e-06, + "loss": 0.5044, + "step": 7315 + }, + { + "epoch": 0.57, + "grad_norm": 1.5728134811947052, + "learning_rate": 8.305875552168894e-06, + "loss": 0.5224, + "step": 7316 + }, + { + "epoch": 0.57, + "grad_norm": 1.1416076190092765, + "learning_rate": 8.303399254645578e-06, + "loss": 0.4906, + "step": 7317 + }, + { + "epoch": 0.57, + "grad_norm": 1.1407239688107764, + "learning_rate": 8.300923064237417e-06, + "loss": 0.5019, + "step": 7318 + }, + { + "epoch": 0.57, + "grad_norm": 1.2180392809517295, + "learning_rate": 8.298446981100749e-06, + "loss": 0.5711, + "step": 7319 + }, + { + "epoch": 0.57, + "grad_norm": 1.1200144287440175, + "learning_rate": 8.295971005391902e-06, + "loss": 0.5164, + "step": 7320 + }, + { + "epoch": 0.57, + "grad_norm": 1.2108949223077898, + "learning_rate": 8.2934951372672e-06, + "loss": 0.5398, + "step": 7321 + }, + { + "epoch": 0.57, + "grad_norm": 1.2023682629600465, + "learning_rate": 8.291019376882955e-06, + "loss": 0.5501, + "step": 7322 + }, + { + "epoch": 0.57, + "grad_norm": 1.190400246059741, + "learning_rate": 8.28854372439547e-06, + "loss": 0.5521, + "step": 7323 + }, + { + "epoch": 0.57, + "grad_norm": 1.1737444398725967, + "learning_rate": 8.28606817996105e-06, + "loss": 0.5256, + "step": 7324 + }, + { + "epoch": 0.57, + "grad_norm": 1.237148163611668, + "learning_rate": 8.283592743735988e-06, + "loss": 0.5686, + "step": 7325 + }, + { + "epoch": 0.57, + "grad_norm": 1.2128477227183603, + "learning_rate": 8.281117415876574e-06, + "loss": 0.5371, + "step": 7326 + }, + { + "epoch": 0.57, + "grad_norm": 1.254491461501668, + "learning_rate": 8.27864219653909e-06, + "loss": 0.533, + "step": 7327 + }, + { + "epoch": 0.57, + "grad_norm": 1.067887559440243, + "learning_rate": 8.276167085879798e-06, + "loss": 0.5263, + "step": 7328 + }, + { + "epoch": 0.57, + "grad_norm": 1.2076692345693945, + "learning_rate": 8.273692084054974e-06, + "loss": 0.555, + "step": 7329 + }, + { + "epoch": 0.57, + "grad_norm": 1.268412593081061, + "learning_rate": 8.271217191220874e-06, + "loss": 0.6188, + "step": 7330 + }, + { + "epoch": 0.57, + "grad_norm": 1.2103028141562966, + "learning_rate": 8.268742407533754e-06, + "loss": 0.58, + "step": 7331 + }, + { + "epoch": 0.57, + "grad_norm": 1.1120631463934354, + "learning_rate": 8.26626773314986e-06, + "loss": 0.5115, + "step": 7332 + }, + { + "epoch": 0.57, + "grad_norm": 1.1895547958041968, + "learning_rate": 8.263793168225425e-06, + "loss": 0.5811, + "step": 7333 + }, + { + "epoch": 0.57, + "grad_norm": 1.2021514124023913, + "learning_rate": 8.261318712916685e-06, + "loss": 0.5762, + "step": 7334 + }, + { + "epoch": 0.57, + "grad_norm": 1.1191333284522993, + "learning_rate": 8.258844367379866e-06, + "loss": 0.4982, + "step": 7335 + }, + { + "epoch": 0.57, + "grad_norm": 1.140360239733101, + "learning_rate": 8.256370131771185e-06, + "loss": 0.5017, + "step": 7336 + }, + { + "epoch": 0.57, + "grad_norm": 1.246165641217594, + "learning_rate": 8.253896006246858e-06, + "loss": 0.5016, + "step": 7337 + }, + { + "epoch": 0.57, + "grad_norm": 1.086722275641157, + "learning_rate": 8.25142199096308e-06, + "loss": 0.5661, + "step": 7338 + }, + { + "epoch": 0.57, + "grad_norm": 1.1408019255240671, + "learning_rate": 8.248948086076052e-06, + "loss": 0.5342, + "step": 7339 + }, + { + "epoch": 0.57, + "grad_norm": 1.1338437877922387, + "learning_rate": 8.24647429174197e-06, + "loss": 0.5049, + "step": 7340 + }, + { + "epoch": 0.57, + "grad_norm": 1.1914391685065533, + "learning_rate": 8.244000608117009e-06, + "loss": 0.5201, + "step": 7341 + }, + { + "epoch": 0.57, + "grad_norm": 1.178209199603273, + "learning_rate": 8.241527035357354e-06, + "loss": 0.5986, + "step": 7342 + }, + { + "epoch": 0.57, + "grad_norm": 1.106167791296938, + "learning_rate": 8.239053573619164e-06, + "loss": 0.5394, + "step": 7343 + }, + { + "epoch": 0.57, + "grad_norm": 1.0767330084479267, + "learning_rate": 8.23658022305861e-06, + "loss": 0.4816, + "step": 7344 + }, + { + "epoch": 0.57, + "grad_norm": 1.1955369726101355, + "learning_rate": 8.234106983831846e-06, + "loss": 0.5693, + "step": 7345 + }, + { + "epoch": 0.57, + "grad_norm": 1.1524646243046868, + "learning_rate": 8.231633856095015e-06, + "loss": 0.5273, + "step": 7346 + }, + { + "epoch": 0.57, + "grad_norm": 1.2637895532261076, + "learning_rate": 8.22916084000427e-06, + "loss": 0.5234, + "step": 7347 + }, + { + "epoch": 0.57, + "grad_norm": 1.1900154375629686, + "learning_rate": 8.22668793571573e-06, + "loss": 0.5517, + "step": 7348 + }, + { + "epoch": 0.57, + "grad_norm": 1.1714373979501782, + "learning_rate": 8.22421514338553e-06, + "loss": 0.5775, + "step": 7349 + }, + { + "epoch": 0.57, + "grad_norm": 1.0615243920512325, + "learning_rate": 8.221742463169794e-06, + "loss": 0.5222, + "step": 7350 + }, + { + "epoch": 0.57, + "grad_norm": 1.2751697389898624, + "learning_rate": 8.219269895224627e-06, + "loss": 0.533, + "step": 7351 + }, + { + "epoch": 0.57, + "grad_norm": 1.132269367371601, + "learning_rate": 8.216797439706142e-06, + "loss": 0.5235, + "step": 7352 + }, + { + "epoch": 0.57, + "grad_norm": 1.1282529953811817, + "learning_rate": 8.214325096770433e-06, + "loss": 0.4858, + "step": 7353 + }, + { + "epoch": 0.57, + "grad_norm": 1.1682322023204603, + "learning_rate": 8.211852866573591e-06, + "loss": 0.5311, + "step": 7354 + }, + { + "epoch": 0.57, + "grad_norm": 1.2195835197630178, + "learning_rate": 8.209380749271708e-06, + "loss": 0.5723, + "step": 7355 + }, + { + "epoch": 0.57, + "grad_norm": 1.2121316251885463, + "learning_rate": 8.206908745020852e-06, + "loss": 0.5515, + "step": 7356 + }, + { + "epoch": 0.57, + "grad_norm": 1.3240574651302344, + "learning_rate": 8.204436853977105e-06, + "loss": 0.5828, + "step": 7357 + }, + { + "epoch": 0.57, + "grad_norm": 1.1867401804191295, + "learning_rate": 8.201965076296518e-06, + "loss": 0.6011, + "step": 7358 + }, + { + "epoch": 0.57, + "grad_norm": 1.1582329217123883, + "learning_rate": 8.199493412135152e-06, + "loss": 0.5356, + "step": 7359 + }, + { + "epoch": 0.57, + "grad_norm": 1.2143649908842091, + "learning_rate": 8.197021861649059e-06, + "loss": 0.5619, + "step": 7360 + }, + { + "epoch": 0.57, + "grad_norm": 1.2721781676289097, + "learning_rate": 8.194550424994274e-06, + "loss": 0.586, + "step": 7361 + }, + { + "epoch": 0.57, + "grad_norm": 1.2765637737440574, + "learning_rate": 8.192079102326842e-06, + "loss": 0.5512, + "step": 7362 + }, + { + "epoch": 0.57, + "grad_norm": 1.2384502885164446, + "learning_rate": 8.189607893802779e-06, + "loss": 0.5519, + "step": 7363 + }, + { + "epoch": 0.57, + "grad_norm": 1.2799023309340236, + "learning_rate": 8.187136799578111e-06, + "loss": 0.6006, + "step": 7364 + }, + { + "epoch": 0.57, + "grad_norm": 1.2247462340665136, + "learning_rate": 8.18466581980885e-06, + "loss": 0.566, + "step": 7365 + }, + { + "epoch": 0.57, + "grad_norm": 1.1231214946337613, + "learning_rate": 8.182194954651e-06, + "loss": 0.5144, + "step": 7366 + }, + { + "epoch": 0.57, + "grad_norm": 1.143971433206162, + "learning_rate": 8.179724204260567e-06, + "loss": 0.5299, + "step": 7367 + }, + { + "epoch": 0.57, + "grad_norm": 1.1346617764409792, + "learning_rate": 8.17725356879353e-06, + "loss": 0.5151, + "step": 7368 + }, + { + "epoch": 0.57, + "grad_norm": 1.131069763325533, + "learning_rate": 8.174783048405882e-06, + "loss": 0.548, + "step": 7369 + }, + { + "epoch": 0.57, + "grad_norm": 1.220352781756787, + "learning_rate": 8.172312643253597e-06, + "loss": 0.5244, + "step": 7370 + }, + { + "epoch": 0.57, + "grad_norm": 1.2574127699399982, + "learning_rate": 8.169842353492644e-06, + "loss": 0.5885, + "step": 7371 + }, + { + "epoch": 0.57, + "grad_norm": 1.1841725360063617, + "learning_rate": 8.167372179278988e-06, + "loss": 0.5662, + "step": 7372 + }, + { + "epoch": 0.57, + "grad_norm": 1.137280826622926, + "learning_rate": 8.164902120768578e-06, + "loss": 0.5198, + "step": 7373 + }, + { + "epoch": 0.57, + "grad_norm": 1.263724324524932, + "learning_rate": 8.162432178117365e-06, + "loss": 0.6203, + "step": 7374 + }, + { + "epoch": 0.57, + "grad_norm": 1.1550726695959108, + "learning_rate": 8.159962351481291e-06, + "loss": 0.5361, + "step": 7375 + }, + { + "epoch": 0.57, + "grad_norm": 1.1247691871378398, + "learning_rate": 8.157492641016285e-06, + "loss": 0.5336, + "step": 7376 + }, + { + "epoch": 0.57, + "grad_norm": 1.0661411165407342, + "learning_rate": 8.15502304687828e-06, + "loss": 0.4792, + "step": 7377 + }, + { + "epoch": 0.57, + "grad_norm": 1.106380558391724, + "learning_rate": 8.152553569223183e-06, + "loss": 0.5188, + "step": 7378 + }, + { + "epoch": 0.57, + "grad_norm": 1.3108831619440742, + "learning_rate": 8.150084208206912e-06, + "loss": 0.5872, + "step": 7379 + }, + { + "epoch": 0.57, + "grad_norm": 1.1911717418571253, + "learning_rate": 8.147614963985371e-06, + "loss": 0.5382, + "step": 7380 + }, + { + "epoch": 0.57, + "grad_norm": 1.1009492722898906, + "learning_rate": 8.145145836714452e-06, + "loss": 0.4933, + "step": 7381 + }, + { + "epoch": 0.57, + "grad_norm": 1.2646844453729982, + "learning_rate": 8.142676826550046e-06, + "loss": 0.5811, + "step": 7382 + }, + { + "epoch": 0.57, + "grad_norm": 1.1039165477447856, + "learning_rate": 8.14020793364804e-06, + "loss": 0.535, + "step": 7383 + }, + { + "epoch": 0.57, + "grad_norm": 1.1592916227317223, + "learning_rate": 8.137739158164297e-06, + "loss": 0.5634, + "step": 7384 + }, + { + "epoch": 0.57, + "grad_norm": 1.2085740682704509, + "learning_rate": 8.135270500254692e-06, + "loss": 0.5594, + "step": 7385 + }, + { + "epoch": 0.57, + "grad_norm": 1.1477347578874182, + "learning_rate": 8.132801960075077e-06, + "loss": 0.5134, + "step": 7386 + }, + { + "epoch": 0.57, + "grad_norm": 1.151745191391416, + "learning_rate": 8.130333537781309e-06, + "loss": 0.5551, + "step": 7387 + }, + { + "epoch": 0.57, + "grad_norm": 1.1227198912567786, + "learning_rate": 8.127865233529235e-06, + "loss": 0.5144, + "step": 7388 + }, + { + "epoch": 0.57, + "grad_norm": 1.147854196241342, + "learning_rate": 8.125397047474684e-06, + "loss": 0.5008, + "step": 7389 + }, + { + "epoch": 0.57, + "grad_norm": 1.2012506722757004, + "learning_rate": 8.122928979773491e-06, + "loss": 0.5657, + "step": 7390 + }, + { + "epoch": 0.57, + "grad_norm": 1.1194352166673986, + "learning_rate": 8.120461030581474e-06, + "loss": 0.4959, + "step": 7391 + }, + { + "epoch": 0.57, + "grad_norm": 1.2512962772527103, + "learning_rate": 8.117993200054449e-06, + "loss": 0.563, + "step": 7392 + }, + { + "epoch": 0.57, + "grad_norm": 1.107005261295845, + "learning_rate": 8.115525488348227e-06, + "loss": 0.4826, + "step": 7393 + }, + { + "epoch": 0.57, + "grad_norm": 1.0240724064775895, + "learning_rate": 8.1130578956186e-06, + "loss": 0.4592, + "step": 7394 + }, + { + "epoch": 0.57, + "grad_norm": 1.1889458938749113, + "learning_rate": 8.110590422021365e-06, + "loss": 0.5709, + "step": 7395 + }, + { + "epoch": 0.57, + "grad_norm": 1.2442022335302863, + "learning_rate": 8.108123067712302e-06, + "loss": 0.5585, + "step": 7396 + }, + { + "epoch": 0.57, + "grad_norm": 1.2719706412681977, + "learning_rate": 8.105655832847193e-06, + "loss": 0.5882, + "step": 7397 + }, + { + "epoch": 0.57, + "grad_norm": 1.2631577355296888, + "learning_rate": 8.103188717581808e-06, + "loss": 0.5847, + "step": 7398 + }, + { + "epoch": 0.57, + "grad_norm": 1.2090703043189124, + "learning_rate": 8.100721722071899e-06, + "loss": 0.5252, + "step": 7399 + }, + { + "epoch": 0.57, + "grad_norm": 1.2898948063210043, + "learning_rate": 8.09825484647323e-06, + "loss": 0.5635, + "step": 7400 + }, + { + "epoch": 0.57, + "grad_norm": 1.212812043376425, + "learning_rate": 8.095788090941543e-06, + "loss": 0.4894, + "step": 7401 + }, + { + "epoch": 0.57, + "grad_norm": 1.1037216129303111, + "learning_rate": 8.093321455632578e-06, + "loss": 0.5261, + "step": 7402 + }, + { + "epoch": 0.57, + "grad_norm": 1.217128751264944, + "learning_rate": 8.09085494070207e-06, + "loss": 0.5909, + "step": 7403 + }, + { + "epoch": 0.57, + "grad_norm": 1.1675637860405172, + "learning_rate": 8.088388546305737e-06, + "loss": 0.5218, + "step": 7404 + }, + { + "epoch": 0.57, + "grad_norm": 1.1547107761292843, + "learning_rate": 8.085922272599297e-06, + "loss": 0.4862, + "step": 7405 + }, + { + "epoch": 0.57, + "grad_norm": 1.1765822410193867, + "learning_rate": 8.08345611973846e-06, + "loss": 0.5368, + "step": 7406 + }, + { + "epoch": 0.57, + "grad_norm": 1.1904028497538757, + "learning_rate": 8.080990087878925e-06, + "loss": 0.5099, + "step": 7407 + }, + { + "epoch": 0.57, + "grad_norm": 1.1265825161366, + "learning_rate": 8.078524177176392e-06, + "loss": 0.5165, + "step": 7408 + }, + { + "epoch": 0.57, + "grad_norm": 1.108522759409965, + "learning_rate": 8.076058387786536e-06, + "loss": 0.4836, + "step": 7409 + }, + { + "epoch": 0.57, + "grad_norm": 1.1494970299813894, + "learning_rate": 8.073592719865041e-06, + "loss": 0.4565, + "step": 7410 + }, + { + "epoch": 0.57, + "grad_norm": 1.2419361843938974, + "learning_rate": 8.071127173567576e-06, + "loss": 0.573, + "step": 7411 + }, + { + "epoch": 0.58, + "grad_norm": 1.1678633626386437, + "learning_rate": 8.068661749049805e-06, + "loss": 0.5478, + "step": 7412 + }, + { + "epoch": 0.58, + "grad_norm": 1.2158832579714967, + "learning_rate": 8.066196446467385e-06, + "loss": 0.5682, + "step": 7413 + }, + { + "epoch": 0.58, + "grad_norm": 1.144539946955945, + "learning_rate": 8.063731265975955e-06, + "loss": 0.58, + "step": 7414 + }, + { + "epoch": 0.58, + "grad_norm": 1.2938475604290853, + "learning_rate": 8.061266207731165e-06, + "loss": 0.5421, + "step": 7415 + }, + { + "epoch": 0.58, + "grad_norm": 1.2323920825192576, + "learning_rate": 8.058801271888637e-06, + "loss": 0.5627, + "step": 7416 + }, + { + "epoch": 0.58, + "grad_norm": 1.2274845812231219, + "learning_rate": 8.056336458604002e-06, + "loss": 0.5575, + "step": 7417 + }, + { + "epoch": 0.58, + "grad_norm": 1.1157593568738184, + "learning_rate": 8.053871768032878e-06, + "loss": 0.5078, + "step": 7418 + }, + { + "epoch": 0.58, + "grad_norm": 1.1072647548406465, + "learning_rate": 8.051407200330866e-06, + "loss": 0.5665, + "step": 7419 + }, + { + "epoch": 0.58, + "grad_norm": 1.1389622522517782, + "learning_rate": 8.048942755653573e-06, + "loss": 0.4904, + "step": 7420 + }, + { + "epoch": 0.58, + "grad_norm": 1.1711855067977095, + "learning_rate": 8.046478434156588e-06, + "loss": 0.5653, + "step": 7421 + }, + { + "epoch": 0.58, + "grad_norm": 1.287530902380584, + "learning_rate": 8.044014235995496e-06, + "loss": 0.5378, + "step": 7422 + }, + { + "epoch": 0.58, + "grad_norm": 1.1900621680842285, + "learning_rate": 8.041550161325884e-06, + "loss": 0.5696, + "step": 7423 + }, + { + "epoch": 0.58, + "grad_norm": 1.1822198630331715, + "learning_rate": 8.039086210303308e-06, + "loss": 0.5236, + "step": 7424 + }, + { + "epoch": 0.58, + "grad_norm": 1.107481939715341, + "learning_rate": 8.036622383083341e-06, + "loss": 0.5087, + "step": 7425 + }, + { + "epoch": 0.58, + "grad_norm": 1.188553693990787, + "learning_rate": 8.034158679821529e-06, + "loss": 0.5522, + "step": 7426 + }, + { + "epoch": 0.58, + "grad_norm": 1.2020325593235863, + "learning_rate": 8.031695100673423e-06, + "loss": 0.5276, + "step": 7427 + }, + { + "epoch": 0.58, + "grad_norm": 1.094103456333685, + "learning_rate": 8.029231645794564e-06, + "loss": 0.5193, + "step": 7428 + }, + { + "epoch": 0.58, + "grad_norm": 1.1391752775899786, + "learning_rate": 8.026768315340475e-06, + "loss": 0.5384, + "step": 7429 + }, + { + "epoch": 0.58, + "grad_norm": 1.1254403524061307, + "learning_rate": 8.024305109466685e-06, + "loss": 0.5018, + "step": 7430 + }, + { + "epoch": 0.58, + "grad_norm": 1.3492063149573306, + "learning_rate": 8.021842028328703e-06, + "loss": 0.621, + "step": 7431 + }, + { + "epoch": 0.58, + "grad_norm": 1.3184535466221496, + "learning_rate": 8.01937907208204e-06, + "loss": 0.5571, + "step": 7432 + }, + { + "epoch": 0.58, + "grad_norm": 1.1071027669372246, + "learning_rate": 8.016916240882202e-06, + "loss": 0.5073, + "step": 7433 + }, + { + "epoch": 0.58, + "grad_norm": 1.125009324776893, + "learning_rate": 8.014453534884666e-06, + "loss": 0.5359, + "step": 7434 + }, + { + "epoch": 0.58, + "grad_norm": 1.1636805579817773, + "learning_rate": 8.011990954244926e-06, + "loss": 0.55, + "step": 7435 + }, + { + "epoch": 0.58, + "grad_norm": 1.1802868867720506, + "learning_rate": 8.00952849911845e-06, + "loss": 0.4918, + "step": 7436 + }, + { + "epoch": 0.58, + "grad_norm": 1.0579360664354809, + "learning_rate": 8.007066169660711e-06, + "loss": 0.5026, + "step": 7437 + }, + { + "epoch": 0.58, + "grad_norm": 1.1517000113673141, + "learning_rate": 8.004603966027167e-06, + "loss": 0.5381, + "step": 7438 + }, + { + "epoch": 0.58, + "grad_norm": 1.2140958391126289, + "learning_rate": 8.002141888373274e-06, + "loss": 0.6047, + "step": 7439 + }, + { + "epoch": 0.58, + "grad_norm": 1.206806297697629, + "learning_rate": 7.999679936854467e-06, + "loss": 0.6038, + "step": 7440 + }, + { + "epoch": 0.58, + "grad_norm": 1.123099682437124, + "learning_rate": 7.997218111626186e-06, + "loss": 0.5236, + "step": 7441 + }, + { + "epoch": 0.58, + "grad_norm": 1.1292954668203181, + "learning_rate": 7.994756412843857e-06, + "loss": 0.5644, + "step": 7442 + }, + { + "epoch": 0.58, + "grad_norm": 1.243385073545735, + "learning_rate": 7.992294840662904e-06, + "loss": 0.521, + "step": 7443 + }, + { + "epoch": 0.58, + "grad_norm": 1.1395898066772119, + "learning_rate": 7.989833395238736e-06, + "loss": 0.5561, + "step": 7444 + }, + { + "epoch": 0.58, + "grad_norm": 1.1119903575801624, + "learning_rate": 7.987372076726758e-06, + "loss": 0.5033, + "step": 7445 + }, + { + "epoch": 0.58, + "grad_norm": 1.1777031484737517, + "learning_rate": 7.984910885282361e-06, + "loss": 0.604, + "step": 7446 + }, + { + "epoch": 0.58, + "grad_norm": 1.1155423943206333, + "learning_rate": 7.982449821060936e-06, + "loss": 0.4984, + "step": 7447 + }, + { + "epoch": 0.58, + "grad_norm": 1.1368127108932402, + "learning_rate": 7.979988884217867e-06, + "loss": 0.5662, + "step": 7448 + }, + { + "epoch": 0.58, + "grad_norm": 1.2103154707521777, + "learning_rate": 7.97752807490852e-06, + "loss": 0.5269, + "step": 7449 + }, + { + "epoch": 0.58, + "grad_norm": 1.2905908944932365, + "learning_rate": 7.97506739328826e-06, + "loss": 0.5872, + "step": 7450 + }, + { + "epoch": 0.58, + "grad_norm": 1.1726942123581245, + "learning_rate": 7.972606839512442e-06, + "loss": 0.5305, + "step": 7451 + }, + { + "epoch": 0.58, + "grad_norm": 1.2484401507104799, + "learning_rate": 7.970146413736414e-06, + "loss": 0.5875, + "step": 7452 + }, + { + "epoch": 0.58, + "grad_norm": 1.1762299552774345, + "learning_rate": 7.967686116115517e-06, + "loss": 0.5538, + "step": 7453 + }, + { + "epoch": 0.58, + "grad_norm": 1.3523709740149084, + "learning_rate": 7.965225946805081e-06, + "loss": 0.5547, + "step": 7454 + }, + { + "epoch": 0.58, + "grad_norm": 1.1544494528065696, + "learning_rate": 7.962765905960428e-06, + "loss": 0.5179, + "step": 7455 + }, + { + "epoch": 0.58, + "grad_norm": 1.1057632108033297, + "learning_rate": 7.960305993736874e-06, + "loss": 0.5098, + "step": 7456 + }, + { + "epoch": 0.58, + "grad_norm": 1.2342755724301664, + "learning_rate": 7.957846210289725e-06, + "loss": 0.6316, + "step": 7457 + }, + { + "epoch": 0.58, + "grad_norm": 1.2917100017210297, + "learning_rate": 7.955386555774284e-06, + "loss": 0.5408, + "step": 7458 + }, + { + "epoch": 0.58, + "grad_norm": 1.2149208755788323, + "learning_rate": 7.952927030345836e-06, + "loss": 0.5383, + "step": 7459 + }, + { + "epoch": 0.58, + "grad_norm": 1.2376940083869323, + "learning_rate": 7.950467634159669e-06, + "loss": 0.6077, + "step": 7460 + }, + { + "epoch": 0.58, + "grad_norm": 1.34088503576488, + "learning_rate": 7.94800836737105e-06, + "loss": 0.5636, + "step": 7461 + }, + { + "epoch": 0.58, + "grad_norm": 1.1595330410814864, + "learning_rate": 7.945549230135251e-06, + "loss": 0.5838, + "step": 7462 + }, + { + "epoch": 0.58, + "grad_norm": 1.1809877215465043, + "learning_rate": 7.94309022260753e-06, + "loss": 0.5433, + "step": 7463 + }, + { + "epoch": 0.58, + "grad_norm": 1.119626989749777, + "learning_rate": 7.940631344943137e-06, + "loss": 0.4819, + "step": 7464 + }, + { + "epoch": 0.58, + "grad_norm": 1.2423304349607722, + "learning_rate": 7.93817259729731e-06, + "loss": 0.5268, + "step": 7465 + }, + { + "epoch": 0.58, + "grad_norm": 1.1871286614706904, + "learning_rate": 7.935713979825285e-06, + "loss": 0.5322, + "step": 7466 + }, + { + "epoch": 0.58, + "grad_norm": 1.2254199884536756, + "learning_rate": 7.933255492682287e-06, + "loss": 0.5376, + "step": 7467 + }, + { + "epoch": 0.58, + "grad_norm": 1.0613239736128324, + "learning_rate": 7.93079713602353e-06, + "loss": 0.4779, + "step": 7468 + }, + { + "epoch": 0.58, + "grad_norm": 1.1185703389062083, + "learning_rate": 7.92833891000423e-06, + "loss": 0.5153, + "step": 7469 + }, + { + "epoch": 0.58, + "grad_norm": 1.1915358175896347, + "learning_rate": 7.925880814779583e-06, + "loss": 0.5657, + "step": 7470 + }, + { + "epoch": 0.58, + "grad_norm": 1.0711968103154779, + "learning_rate": 7.923422850504777e-06, + "loss": 0.5555, + "step": 7471 + }, + { + "epoch": 0.58, + "grad_norm": 1.2811896612334384, + "learning_rate": 7.920965017335002e-06, + "loss": 0.5162, + "step": 7472 + }, + { + "epoch": 0.58, + "grad_norm": 1.2086162345584948, + "learning_rate": 7.918507315425432e-06, + "loss": 0.5665, + "step": 7473 + }, + { + "epoch": 0.58, + "grad_norm": 1.1265145279700568, + "learning_rate": 7.916049744931236e-06, + "loss": 0.5498, + "step": 7474 + }, + { + "epoch": 0.58, + "grad_norm": 1.1811039486164794, + "learning_rate": 7.91359230600757e-06, + "loss": 0.5302, + "step": 7475 + }, + { + "epoch": 0.58, + "grad_norm": 1.0706571594859717, + "learning_rate": 7.911134998809585e-06, + "loss": 0.521, + "step": 7476 + }, + { + "epoch": 0.58, + "grad_norm": 1.0910320298504563, + "learning_rate": 7.908677823492424e-06, + "loss": 0.5727, + "step": 7477 + }, + { + "epoch": 0.58, + "grad_norm": 1.159279694457198, + "learning_rate": 7.906220780211225e-06, + "loss": 0.5083, + "step": 7478 + }, + { + "epoch": 0.58, + "grad_norm": 1.2002011726552926, + "learning_rate": 7.90376386912111e-06, + "loss": 0.5253, + "step": 7479 + }, + { + "epoch": 0.58, + "grad_norm": 1.268780953165951, + "learning_rate": 7.901307090377197e-06, + "loss": 0.5538, + "step": 7480 + }, + { + "epoch": 0.58, + "grad_norm": 1.1283256859475594, + "learning_rate": 7.898850444134592e-06, + "loss": 0.5385, + "step": 7481 + }, + { + "epoch": 0.58, + "grad_norm": 1.1390025474982035, + "learning_rate": 7.8963939305484e-06, + "loss": 0.5325, + "step": 7482 + }, + { + "epoch": 0.58, + "grad_norm": 1.0603178841397687, + "learning_rate": 7.893937549773716e-06, + "loss": 0.5175, + "step": 7483 + }, + { + "epoch": 0.58, + "grad_norm": 1.0824843466774459, + "learning_rate": 7.891481301965618e-06, + "loss": 0.5412, + "step": 7484 + }, + { + "epoch": 0.58, + "grad_norm": 1.3265900661807983, + "learning_rate": 7.889025187279185e-06, + "loss": 0.5421, + "step": 7485 + }, + { + "epoch": 0.58, + "grad_norm": 1.2074165099761256, + "learning_rate": 7.886569205869481e-06, + "loss": 0.5717, + "step": 7486 + }, + { + "epoch": 0.58, + "grad_norm": 1.0868792395484372, + "learning_rate": 7.884113357891566e-06, + "loss": 0.5636, + "step": 7487 + }, + { + "epoch": 0.58, + "grad_norm": 1.2002539624254245, + "learning_rate": 7.881657643500495e-06, + "loss": 0.6008, + "step": 7488 + }, + { + "epoch": 0.58, + "grad_norm": 1.2799451277564378, + "learning_rate": 7.879202062851303e-06, + "loss": 0.5787, + "step": 7489 + }, + { + "epoch": 0.58, + "grad_norm": 1.1200855786121509, + "learning_rate": 7.876746616099031e-06, + "loss": 0.4872, + "step": 7490 + }, + { + "epoch": 0.58, + "grad_norm": 1.0010039535124273, + "learning_rate": 7.874291303398696e-06, + "loss": 0.5124, + "step": 7491 + }, + { + "epoch": 0.58, + "grad_norm": 1.03330788324661, + "learning_rate": 7.871836124905316e-06, + "loss": 0.4874, + "step": 7492 + }, + { + "epoch": 0.58, + "grad_norm": 1.1881740062706498, + "learning_rate": 7.869381080773906e-06, + "loss": 0.477, + "step": 7493 + }, + { + "epoch": 0.58, + "grad_norm": 1.1375168558066568, + "learning_rate": 7.866926171159458e-06, + "loss": 0.5021, + "step": 7494 + }, + { + "epoch": 0.58, + "grad_norm": 1.023417232400103, + "learning_rate": 7.86447139621697e-06, + "loss": 0.5129, + "step": 7495 + }, + { + "epoch": 0.58, + "grad_norm": 1.0473909387738125, + "learning_rate": 7.862016756101417e-06, + "loss": 0.5049, + "step": 7496 + }, + { + "epoch": 0.58, + "grad_norm": 1.2152867138246457, + "learning_rate": 7.859562250967776e-06, + "loss": 0.5431, + "step": 7497 + }, + { + "epoch": 0.58, + "grad_norm": 1.178171611238214, + "learning_rate": 7.857107880971015e-06, + "loss": 0.529, + "step": 7498 + }, + { + "epoch": 0.58, + "grad_norm": 1.1518408243591383, + "learning_rate": 7.854653646266089e-06, + "loss": 0.496, + "step": 7499 + }, + { + "epoch": 0.58, + "grad_norm": 1.138098596961056, + "learning_rate": 7.85219954700795e-06, + "loss": 0.5358, + "step": 7500 + }, + { + "epoch": 0.58, + "grad_norm": 1.1159691736516886, + "learning_rate": 7.84974558335153e-06, + "loss": 0.4809, + "step": 7501 + }, + { + "epoch": 0.58, + "grad_norm": 1.2103259603597454, + "learning_rate": 7.847291755451766e-06, + "loss": 0.5712, + "step": 7502 + }, + { + "epoch": 0.58, + "grad_norm": 1.2463382493036335, + "learning_rate": 7.844838063463582e-06, + "loss": 0.5461, + "step": 7503 + }, + { + "epoch": 0.58, + "grad_norm": 1.197938216748105, + "learning_rate": 7.842384507541889e-06, + "loss": 0.549, + "step": 7504 + }, + { + "epoch": 0.58, + "grad_norm": 1.303995005773564, + "learning_rate": 7.839931087841595e-06, + "loss": 0.569, + "step": 7505 + }, + { + "epoch": 0.58, + "grad_norm": 1.2191946734542334, + "learning_rate": 7.837477804517595e-06, + "loss": 0.5447, + "step": 7506 + }, + { + "epoch": 0.58, + "grad_norm": 1.103673440906869, + "learning_rate": 7.835024657724778e-06, + "loss": 0.5132, + "step": 7507 + }, + { + "epoch": 0.58, + "grad_norm": 1.2174770848117127, + "learning_rate": 7.832571647618024e-06, + "loss": 0.5735, + "step": 7508 + }, + { + "epoch": 0.58, + "grad_norm": 1.1819323471253134, + "learning_rate": 7.830118774352205e-06, + "loss": 0.513, + "step": 7509 + }, + { + "epoch": 0.58, + "grad_norm": 1.2180005238849785, + "learning_rate": 7.827666038082185e-06, + "loss": 0.5557, + "step": 7510 + }, + { + "epoch": 0.58, + "grad_norm": 1.1654775326795914, + "learning_rate": 7.825213438962812e-06, + "loss": 0.4963, + "step": 7511 + }, + { + "epoch": 0.58, + "grad_norm": 1.0044061032353246, + "learning_rate": 7.822760977148936e-06, + "loss": 0.4914, + "step": 7512 + }, + { + "epoch": 0.58, + "grad_norm": 1.105309734432696, + "learning_rate": 7.820308652795393e-06, + "loss": 0.5098, + "step": 7513 + }, + { + "epoch": 0.58, + "grad_norm": 1.0785365217672074, + "learning_rate": 7.81785646605701e-06, + "loss": 0.5751, + "step": 7514 + }, + { + "epoch": 0.58, + "grad_norm": 1.2183900937497338, + "learning_rate": 7.81540441708861e-06, + "loss": 0.4992, + "step": 7515 + }, + { + "epoch": 0.58, + "grad_norm": 1.2433800880463783, + "learning_rate": 7.812952506044996e-06, + "loss": 0.5612, + "step": 7516 + }, + { + "epoch": 0.58, + "grad_norm": 1.1851945884704642, + "learning_rate": 7.810500733080974e-06, + "loss": 0.5334, + "step": 7517 + }, + { + "epoch": 0.58, + "grad_norm": 1.103708868070359, + "learning_rate": 7.80804909835134e-06, + "loss": 0.5132, + "step": 7518 + }, + { + "epoch": 0.58, + "grad_norm": 1.2963949486997477, + "learning_rate": 7.805597602010873e-06, + "loss": 0.6022, + "step": 7519 + }, + { + "epoch": 0.58, + "grad_norm": 1.2372414829456624, + "learning_rate": 7.803146244214355e-06, + "loss": 0.567, + "step": 7520 + }, + { + "epoch": 0.58, + "grad_norm": 1.1047388369705629, + "learning_rate": 7.800695025116546e-06, + "loss": 0.4538, + "step": 7521 + }, + { + "epoch": 0.58, + "grad_norm": 1.1197263240231736, + "learning_rate": 7.798243944872204e-06, + "loss": 0.5379, + "step": 7522 + }, + { + "epoch": 0.58, + "grad_norm": 1.0979699063406962, + "learning_rate": 7.795793003636085e-06, + "loss": 0.5311, + "step": 7523 + }, + { + "epoch": 0.58, + "grad_norm": 1.167235025972854, + "learning_rate": 7.793342201562923e-06, + "loss": 0.5486, + "step": 7524 + }, + { + "epoch": 0.58, + "grad_norm": 1.1880808965452814, + "learning_rate": 7.790891538807459e-06, + "loss": 0.5442, + "step": 7525 + }, + { + "epoch": 0.58, + "grad_norm": 1.1404507125011358, + "learning_rate": 7.788441015524403e-06, + "loss": 0.5421, + "step": 7526 + }, + { + "epoch": 0.58, + "grad_norm": 1.1725054761518772, + "learning_rate": 7.785990631868478e-06, + "loss": 0.5171, + "step": 7527 + }, + { + "epoch": 0.58, + "grad_norm": 1.2991012400819353, + "learning_rate": 7.783540387994387e-06, + "loss": 0.5217, + "step": 7528 + }, + { + "epoch": 0.58, + "grad_norm": 1.1395736970557997, + "learning_rate": 7.781090284056827e-06, + "loss": 0.54, + "step": 7529 + }, + { + "epoch": 0.58, + "grad_norm": 1.1179872864493376, + "learning_rate": 7.778640320210487e-06, + "loss": 0.4949, + "step": 7530 + }, + { + "epoch": 0.58, + "grad_norm": 1.1500296423035712, + "learning_rate": 7.776190496610043e-06, + "loss": 0.5148, + "step": 7531 + }, + { + "epoch": 0.58, + "grad_norm": 1.2500664216514035, + "learning_rate": 7.773740813410165e-06, + "loss": 0.587, + "step": 7532 + }, + { + "epoch": 0.58, + "grad_norm": 1.1721832887130992, + "learning_rate": 7.771291270765518e-06, + "loss": 0.4816, + "step": 7533 + }, + { + "epoch": 0.58, + "grad_norm": 1.199138278311463, + "learning_rate": 7.768841868830747e-06, + "loss": 0.524, + "step": 7534 + }, + { + "epoch": 0.58, + "grad_norm": 1.2628818973011953, + "learning_rate": 7.766392607760508e-06, + "loss": 0.5892, + "step": 7535 + }, + { + "epoch": 0.58, + "grad_norm": 1.1327899799081569, + "learning_rate": 7.76394348770942e-06, + "loss": 0.4763, + "step": 7536 + }, + { + "epoch": 0.58, + "grad_norm": 1.068678618160993, + "learning_rate": 7.76149450883212e-06, + "loss": 0.5075, + "step": 7537 + }, + { + "epoch": 0.58, + "grad_norm": 1.143850443030654, + "learning_rate": 7.759045671283219e-06, + "loss": 0.5496, + "step": 7538 + }, + { + "epoch": 0.58, + "grad_norm": 1.1350588920203621, + "learning_rate": 7.756596975217327e-06, + "loss": 0.5346, + "step": 7539 + }, + { + "epoch": 0.58, + "grad_norm": 1.1705322201973778, + "learning_rate": 7.754148420789047e-06, + "loss": 0.5059, + "step": 7540 + }, + { + "epoch": 0.59, + "grad_norm": 1.2948119518177412, + "learning_rate": 7.751700008152959e-06, + "loss": 0.578, + "step": 7541 + }, + { + "epoch": 0.59, + "grad_norm": 1.0714561867561079, + "learning_rate": 7.749251737463648e-06, + "loss": 0.5123, + "step": 7542 + }, + { + "epoch": 0.59, + "grad_norm": 1.2774052503053566, + "learning_rate": 7.74680360887569e-06, + "loss": 0.5184, + "step": 7543 + }, + { + "epoch": 0.59, + "grad_norm": 1.1999482163541297, + "learning_rate": 7.744355622543643e-06, + "loss": 0.5446, + "step": 7544 + }, + { + "epoch": 0.59, + "grad_norm": 1.3017220517259611, + "learning_rate": 7.741907778622064e-06, + "loss": 0.578, + "step": 7545 + }, + { + "epoch": 0.59, + "grad_norm": 1.0748018104255457, + "learning_rate": 7.739460077265502e-06, + "loss": 0.514, + "step": 7546 + }, + { + "epoch": 0.59, + "grad_norm": 1.2338023245555976, + "learning_rate": 7.737012518628482e-06, + "loss": 0.4957, + "step": 7547 + }, + { + "epoch": 0.59, + "grad_norm": 1.218626602968466, + "learning_rate": 7.73456510286554e-06, + "loss": 0.5326, + "step": 7548 + }, + { + "epoch": 0.59, + "grad_norm": 1.3630973014133738, + "learning_rate": 7.732117830131189e-06, + "loss": 0.5607, + "step": 7549 + }, + { + "epoch": 0.59, + "grad_norm": 1.120810496205146, + "learning_rate": 7.72967070057994e-06, + "loss": 0.5164, + "step": 7550 + }, + { + "epoch": 0.59, + "grad_norm": 1.2480994558227565, + "learning_rate": 7.727223714366299e-06, + "loss": 0.553, + "step": 7551 + }, + { + "epoch": 0.59, + "grad_norm": 1.2262898524785526, + "learning_rate": 7.724776871644745e-06, + "loss": 0.5761, + "step": 7552 + }, + { + "epoch": 0.59, + "grad_norm": 1.1602232428837291, + "learning_rate": 7.722330172569767e-06, + "loss": 0.5497, + "step": 7553 + }, + { + "epoch": 0.59, + "grad_norm": 1.2029902890575266, + "learning_rate": 7.719883617295835e-06, + "loss": 0.5727, + "step": 7554 + }, + { + "epoch": 0.59, + "grad_norm": 1.2652610443875372, + "learning_rate": 7.717437205977414e-06, + "loss": 0.5182, + "step": 7555 + }, + { + "epoch": 0.59, + "grad_norm": 1.1500395415598474, + "learning_rate": 7.714990938768964e-06, + "loss": 0.5374, + "step": 7556 + }, + { + "epoch": 0.59, + "grad_norm": 1.1985369803906565, + "learning_rate": 7.712544815824921e-06, + "loss": 0.5478, + "step": 7557 + }, + { + "epoch": 0.59, + "grad_norm": 1.1611700284622768, + "learning_rate": 7.710098837299726e-06, + "loss": 0.5886, + "step": 7558 + }, + { + "epoch": 0.59, + "grad_norm": 1.1845211009396022, + "learning_rate": 7.707653003347805e-06, + "loss": 0.5464, + "step": 7559 + }, + { + "epoch": 0.59, + "grad_norm": 1.322771434871977, + "learning_rate": 7.705207314123577e-06, + "loss": 0.6593, + "step": 7560 + }, + { + "epoch": 0.59, + "grad_norm": 1.111559154986994, + "learning_rate": 7.702761769781454e-06, + "loss": 0.4898, + "step": 7561 + }, + { + "epoch": 0.59, + "grad_norm": 1.1570569392925638, + "learning_rate": 7.700316370475828e-06, + "loss": 0.4695, + "step": 7562 + }, + { + "epoch": 0.59, + "grad_norm": 1.2980350224071178, + "learning_rate": 7.697871116361099e-06, + "loss": 0.5757, + "step": 7563 + }, + { + "epoch": 0.59, + "grad_norm": 1.1888964122215295, + "learning_rate": 7.69542600759164e-06, + "loss": 0.5171, + "step": 7564 + }, + { + "epoch": 0.59, + "grad_norm": 1.1658745812969207, + "learning_rate": 7.692981044321826e-06, + "loss": 0.5855, + "step": 7565 + }, + { + "epoch": 0.59, + "grad_norm": 1.2530777234983987, + "learning_rate": 7.690536226706028e-06, + "loss": 0.5078, + "step": 7566 + }, + { + "epoch": 0.59, + "grad_norm": 1.2676092534879186, + "learning_rate": 7.688091554898587e-06, + "loss": 0.5521, + "step": 7567 + }, + { + "epoch": 0.59, + "grad_norm": 1.2287932599234257, + "learning_rate": 7.685647029053857e-06, + "loss": 0.5336, + "step": 7568 + }, + { + "epoch": 0.59, + "grad_norm": 1.2328067909949798, + "learning_rate": 7.683202649326169e-06, + "loss": 0.5133, + "step": 7569 + }, + { + "epoch": 0.59, + "grad_norm": 1.1685160499649005, + "learning_rate": 7.68075841586985e-06, + "loss": 0.5128, + "step": 7570 + }, + { + "epoch": 0.59, + "grad_norm": 1.1003401447128849, + "learning_rate": 7.678314328839223e-06, + "loss": 0.5109, + "step": 7571 + }, + { + "epoch": 0.59, + "grad_norm": 1.2123902005690965, + "learning_rate": 7.675870388388586e-06, + "loss": 0.5613, + "step": 7572 + }, + { + "epoch": 0.59, + "grad_norm": 1.1539914522631276, + "learning_rate": 7.673426594672243e-06, + "loss": 0.4736, + "step": 7573 + }, + { + "epoch": 0.59, + "grad_norm": 1.170372622973876, + "learning_rate": 7.670982947844482e-06, + "loss": 0.522, + "step": 7574 + }, + { + "epoch": 0.59, + "grad_norm": 1.0466722320501818, + "learning_rate": 7.668539448059585e-06, + "loss": 0.4948, + "step": 7575 + }, + { + "epoch": 0.59, + "grad_norm": 1.0414242971105694, + "learning_rate": 7.666096095471823e-06, + "loss": 0.4884, + "step": 7576 + }, + { + "epoch": 0.59, + "grad_norm": 1.192964429166772, + "learning_rate": 7.663652890235452e-06, + "loss": 0.5673, + "step": 7577 + }, + { + "epoch": 0.59, + "grad_norm": 1.1525280304021817, + "learning_rate": 7.661209832504731e-06, + "loss": 0.5482, + "step": 7578 + }, + { + "epoch": 0.59, + "grad_norm": 1.1793854181408703, + "learning_rate": 7.658766922433898e-06, + "loss": 0.499, + "step": 7579 + }, + { + "epoch": 0.59, + "grad_norm": 1.2662259311779778, + "learning_rate": 7.656324160177187e-06, + "loss": 0.52, + "step": 7580 + }, + { + "epoch": 0.59, + "grad_norm": 1.2434548205031937, + "learning_rate": 7.653881545888829e-06, + "loss": 0.5092, + "step": 7581 + }, + { + "epoch": 0.59, + "grad_norm": 1.2173877830410018, + "learning_rate": 7.65143907972303e-06, + "loss": 0.5567, + "step": 7582 + }, + { + "epoch": 0.59, + "grad_norm": 1.2011819978613083, + "learning_rate": 7.648996761834e-06, + "loss": 0.5432, + "step": 7583 + }, + { + "epoch": 0.59, + "grad_norm": 1.1359903945046632, + "learning_rate": 7.64655459237593e-06, + "loss": 0.5661, + "step": 7584 + }, + { + "epoch": 0.59, + "grad_norm": 1.2234511548835636, + "learning_rate": 7.644112571503014e-06, + "loss": 0.5574, + "step": 7585 + }, + { + "epoch": 0.59, + "grad_norm": 1.1583801443031645, + "learning_rate": 7.641670699369429e-06, + "loss": 0.5709, + "step": 7586 + }, + { + "epoch": 0.59, + "grad_norm": 1.1141114468943734, + "learning_rate": 7.639228976129337e-06, + "loss": 0.5244, + "step": 7587 + }, + { + "epoch": 0.59, + "grad_norm": 1.1627786199918313, + "learning_rate": 7.636787401936899e-06, + "loss": 0.5644, + "step": 7588 + }, + { + "epoch": 0.59, + "grad_norm": 1.124749897600938, + "learning_rate": 7.634345976946265e-06, + "loss": 0.5353, + "step": 7589 + }, + { + "epoch": 0.59, + "grad_norm": 1.0454095860942514, + "learning_rate": 7.631904701311574e-06, + "loss": 0.4875, + "step": 7590 + }, + { + "epoch": 0.59, + "grad_norm": 1.0700293953146631, + "learning_rate": 7.62946357518696e-06, + "loss": 0.5329, + "step": 7591 + }, + { + "epoch": 0.59, + "grad_norm": 1.1372642650099527, + "learning_rate": 7.627022598726539e-06, + "loss": 0.5555, + "step": 7592 + }, + { + "epoch": 0.59, + "grad_norm": 1.0988647758590586, + "learning_rate": 7.624581772084425e-06, + "loss": 0.4828, + "step": 7593 + }, + { + "epoch": 0.59, + "grad_norm": 1.1854360110688413, + "learning_rate": 7.622141095414717e-06, + "loss": 0.5528, + "step": 7594 + }, + { + "epoch": 0.59, + "grad_norm": 1.0399068399115785, + "learning_rate": 7.619700568871511e-06, + "loss": 0.4943, + "step": 7595 + }, + { + "epoch": 0.59, + "grad_norm": 1.2005507933656396, + "learning_rate": 7.617260192608892e-06, + "loss": 0.5032, + "step": 7596 + }, + { + "epoch": 0.59, + "grad_norm": 1.2502070732260793, + "learning_rate": 7.614819966780926e-06, + "loss": 0.5761, + "step": 7597 + }, + { + "epoch": 0.59, + "grad_norm": 1.2954981285502771, + "learning_rate": 7.6123798915416845e-06, + "loss": 0.5198, + "step": 7598 + }, + { + "epoch": 0.59, + "grad_norm": 1.1211739753967689, + "learning_rate": 7.609939967045217e-06, + "loss": 0.5431, + "step": 7599 + }, + { + "epoch": 0.59, + "grad_norm": 1.1132347766478696, + "learning_rate": 7.60750019344557e-06, + "loss": 0.5113, + "step": 7600 + }, + { + "epoch": 0.59, + "grad_norm": 1.0685703550461578, + "learning_rate": 7.605060570896781e-06, + "loss": 0.5003, + "step": 7601 + }, + { + "epoch": 0.59, + "grad_norm": 1.253303549358541, + "learning_rate": 7.602621099552874e-06, + "loss": 0.5236, + "step": 7602 + }, + { + "epoch": 0.59, + "grad_norm": 1.1137780937992867, + "learning_rate": 7.600181779567867e-06, + "loss": 0.5163, + "step": 7603 + }, + { + "epoch": 0.59, + "grad_norm": 1.2922049856790265, + "learning_rate": 7.597742611095762e-06, + "loss": 0.5713, + "step": 7604 + }, + { + "epoch": 0.59, + "grad_norm": 1.0113709194283491, + "learning_rate": 7.595303594290562e-06, + "loss": 0.509, + "step": 7605 + }, + { + "epoch": 0.59, + "grad_norm": 1.2947273395954355, + "learning_rate": 7.592864729306253e-06, + "loss": 0.5899, + "step": 7606 + }, + { + "epoch": 0.59, + "grad_norm": 1.1870841753871757, + "learning_rate": 7.5904260162968145e-06, + "loss": 0.5193, + "step": 7607 + }, + { + "epoch": 0.59, + "grad_norm": 1.249563808629943, + "learning_rate": 7.5879874554162124e-06, + "loss": 0.5446, + "step": 7608 + }, + { + "epoch": 0.59, + "grad_norm": 1.1636638598823918, + "learning_rate": 7.585549046818405e-06, + "loss": 0.5522, + "step": 7609 + }, + { + "epoch": 0.59, + "grad_norm": 1.1918571244386105, + "learning_rate": 7.583110790657342e-06, + "loss": 0.5686, + "step": 7610 + }, + { + "epoch": 0.59, + "grad_norm": 1.2336789353609325, + "learning_rate": 7.580672687086967e-06, + "loss": 0.5429, + "step": 7611 + }, + { + "epoch": 0.59, + "grad_norm": 1.1284992579946584, + "learning_rate": 7.578234736261208e-06, + "loss": 0.4427, + "step": 7612 + }, + { + "epoch": 0.59, + "grad_norm": 1.1799169658282695, + "learning_rate": 7.575796938333986e-06, + "loss": 0.5426, + "step": 7613 + }, + { + "epoch": 0.59, + "grad_norm": 1.1395142254746995, + "learning_rate": 7.573359293459206e-06, + "loss": 0.4955, + "step": 7614 + }, + { + "epoch": 0.59, + "grad_norm": 1.0797373008007733, + "learning_rate": 7.5709218017907734e-06, + "loss": 0.4721, + "step": 7615 + }, + { + "epoch": 0.59, + "grad_norm": 1.157151824211684, + "learning_rate": 7.568484463482584e-06, + "loss": 0.5152, + "step": 7616 + }, + { + "epoch": 0.59, + "grad_norm": 1.2051507318964612, + "learning_rate": 7.566047278688514e-06, + "loss": 0.5205, + "step": 7617 + }, + { + "epoch": 0.59, + "grad_norm": 1.1926528162577588, + "learning_rate": 7.563610247562437e-06, + "loss": 0.5057, + "step": 7618 + }, + { + "epoch": 0.59, + "grad_norm": 1.1477989964946236, + "learning_rate": 7.561173370258215e-06, + "loss": 0.5426, + "step": 7619 + }, + { + "epoch": 0.59, + "grad_norm": 1.160051231897204, + "learning_rate": 7.558736646929699e-06, + "loss": 0.5412, + "step": 7620 + }, + { + "epoch": 0.59, + "grad_norm": 1.2793791067470277, + "learning_rate": 7.556300077730735e-06, + "loss": 0.5875, + "step": 7621 + }, + { + "epoch": 0.59, + "grad_norm": 1.1494886816495486, + "learning_rate": 7.553863662815156e-06, + "loss": 0.5226, + "step": 7622 + }, + { + "epoch": 0.59, + "grad_norm": 1.1578831866021364, + "learning_rate": 7.551427402336784e-06, + "loss": 0.4992, + "step": 7623 + }, + { + "epoch": 0.59, + "grad_norm": 1.2472773942353286, + "learning_rate": 7.548991296449431e-06, + "loss": 0.547, + "step": 7624 + }, + { + "epoch": 0.59, + "grad_norm": 1.1410487707247865, + "learning_rate": 7.546555345306904e-06, + "loss": 0.5297, + "step": 7625 + }, + { + "epoch": 0.59, + "grad_norm": 1.2668888704418546, + "learning_rate": 7.544119549062998e-06, + "loss": 0.5275, + "step": 7626 + }, + { + "epoch": 0.59, + "grad_norm": 1.3520743663159378, + "learning_rate": 7.541683907871494e-06, + "loss": 0.5752, + "step": 7627 + }, + { + "epoch": 0.59, + "grad_norm": 1.1963845724787652, + "learning_rate": 7.539248421886169e-06, + "loss": 0.5619, + "step": 7628 + }, + { + "epoch": 0.59, + "grad_norm": 1.2333444566912413, + "learning_rate": 7.536813091260786e-06, + "loss": 0.5273, + "step": 7629 + }, + { + "epoch": 0.59, + "grad_norm": 1.1379288462365693, + "learning_rate": 7.5343779161491e-06, + "loss": 0.5302, + "step": 7630 + }, + { + "epoch": 0.59, + "grad_norm": 1.259515639940586, + "learning_rate": 7.531942896704859e-06, + "loss": 0.5402, + "step": 7631 + }, + { + "epoch": 0.59, + "grad_norm": 1.1645478126600004, + "learning_rate": 7.529508033081796e-06, + "loss": 0.5634, + "step": 7632 + }, + { + "epoch": 0.59, + "grad_norm": 1.154329870691558, + "learning_rate": 7.5270733254336385e-06, + "loss": 0.5316, + "step": 7633 + }, + { + "epoch": 0.59, + "grad_norm": 1.1067942298959335, + "learning_rate": 7.524638773914097e-06, + "loss": 0.5262, + "step": 7634 + }, + { + "epoch": 0.59, + "grad_norm": 1.1012922455979763, + "learning_rate": 7.522204378676882e-06, + "loss": 0.4972, + "step": 7635 + }, + { + "epoch": 0.59, + "grad_norm": 1.127135052202406, + "learning_rate": 7.5197701398756895e-06, + "loss": 0.5698, + "step": 7636 + }, + { + "epoch": 0.59, + "grad_norm": 1.0509799925318029, + "learning_rate": 7.517336057664204e-06, + "loss": 0.434, + "step": 7637 + }, + { + "epoch": 0.59, + "grad_norm": 1.13333891114564, + "learning_rate": 7.514902132196104e-06, + "loss": 0.4974, + "step": 7638 + }, + { + "epoch": 0.59, + "grad_norm": 1.1210752479010047, + "learning_rate": 7.5124683636250495e-06, + "loss": 0.4977, + "step": 7639 + }, + { + "epoch": 0.59, + "grad_norm": 1.102641125424338, + "learning_rate": 7.5100347521047025e-06, + "loss": 0.4417, + "step": 7640 + }, + { + "epoch": 0.59, + "grad_norm": 1.1227709091132043, + "learning_rate": 7.5076012977887095e-06, + "loss": 0.5072, + "step": 7641 + }, + { + "epoch": 0.59, + "grad_norm": 1.1782862950009478, + "learning_rate": 7.505168000830708e-06, + "loss": 0.5614, + "step": 7642 + }, + { + "epoch": 0.59, + "grad_norm": 1.1371494276985863, + "learning_rate": 7.502734861384319e-06, + "loss": 0.5376, + "step": 7643 + }, + { + "epoch": 0.59, + "grad_norm": 1.2617740678028007, + "learning_rate": 7.500301879603161e-06, + "loss": 0.5574, + "step": 7644 + }, + { + "epoch": 0.59, + "grad_norm": 1.2344276803279728, + "learning_rate": 7.497869055640843e-06, + "loss": 0.5284, + "step": 7645 + }, + { + "epoch": 0.59, + "grad_norm": 1.1017987694555291, + "learning_rate": 7.495436389650962e-06, + "loss": 0.5071, + "step": 7646 + }, + { + "epoch": 0.59, + "grad_norm": 1.2738314996164046, + "learning_rate": 7.493003881787106e-06, + "loss": 0.5595, + "step": 7647 + }, + { + "epoch": 0.59, + "grad_norm": 1.0699546382771965, + "learning_rate": 7.490571532202846e-06, + "loss": 0.4724, + "step": 7648 + }, + { + "epoch": 0.59, + "grad_norm": 1.2542344373893626, + "learning_rate": 7.488139341051751e-06, + "loss": 0.5412, + "step": 7649 + }, + { + "epoch": 0.59, + "grad_norm": 1.2142250472629428, + "learning_rate": 7.48570730848738e-06, + "loss": 0.5147, + "step": 7650 + }, + { + "epoch": 0.59, + "grad_norm": 1.1245022838360708, + "learning_rate": 7.48327543466328e-06, + "loss": 0.5037, + "step": 7651 + }, + { + "epoch": 0.59, + "grad_norm": 1.0819864091636586, + "learning_rate": 7.480843719732989e-06, + "loss": 0.4783, + "step": 7652 + }, + { + "epoch": 0.59, + "grad_norm": 1.2437783377021394, + "learning_rate": 7.478412163850026e-06, + "loss": 0.511, + "step": 7653 + }, + { + "epoch": 0.59, + "grad_norm": 1.213420758206791, + "learning_rate": 7.475980767167914e-06, + "loss": 0.5529, + "step": 7654 + }, + { + "epoch": 0.59, + "grad_norm": 1.102253690267413, + "learning_rate": 7.473549529840157e-06, + "loss": 0.4977, + "step": 7655 + }, + { + "epoch": 0.59, + "grad_norm": 1.2417763563430688, + "learning_rate": 7.471118452020256e-06, + "loss": 0.576, + "step": 7656 + }, + { + "epoch": 0.59, + "grad_norm": 1.2196627891123033, + "learning_rate": 7.4686875338616914e-06, + "loss": 0.5549, + "step": 7657 + }, + { + "epoch": 0.59, + "grad_norm": 1.1907378275920073, + "learning_rate": 7.466256775517948e-06, + "loss": 0.547, + "step": 7658 + }, + { + "epoch": 0.59, + "grad_norm": 1.1468677302241865, + "learning_rate": 7.463826177142483e-06, + "loss": 0.4931, + "step": 7659 + }, + { + "epoch": 0.59, + "grad_norm": 1.1262522721347055, + "learning_rate": 7.461395738888755e-06, + "loss": 0.4846, + "step": 7660 + }, + { + "epoch": 0.59, + "grad_norm": 1.1475780151375445, + "learning_rate": 7.458965460910214e-06, + "loss": 0.5383, + "step": 7661 + }, + { + "epoch": 0.59, + "grad_norm": 1.1554937209087068, + "learning_rate": 7.456535343360292e-06, + "loss": 0.5361, + "step": 7662 + }, + { + "epoch": 0.59, + "grad_norm": 1.028826727433479, + "learning_rate": 7.454105386392419e-06, + "loss": 0.4948, + "step": 7663 + }, + { + "epoch": 0.59, + "grad_norm": 1.1928663468393321, + "learning_rate": 7.451675590160006e-06, + "loss": 0.5492, + "step": 7664 + }, + { + "epoch": 0.59, + "grad_norm": 1.2017612407008635, + "learning_rate": 7.449245954816459e-06, + "loss": 0.5441, + "step": 7665 + }, + { + "epoch": 0.59, + "grad_norm": 1.0750432959420864, + "learning_rate": 7.446816480515177e-06, + "loss": 0.5057, + "step": 7666 + }, + { + "epoch": 0.59, + "grad_norm": 1.1238838593018492, + "learning_rate": 7.444387167409541e-06, + "loss": 0.4991, + "step": 7667 + }, + { + "epoch": 0.59, + "grad_norm": 1.2414968237895796, + "learning_rate": 7.441958015652934e-06, + "loss": 0.5342, + "step": 7668 + }, + { + "epoch": 0.59, + "grad_norm": 1.188644660590536, + "learning_rate": 7.4395290253987105e-06, + "loss": 0.5184, + "step": 7669 + }, + { + "epoch": 0.6, + "grad_norm": 1.170451762287841, + "learning_rate": 7.43710019680023e-06, + "loss": 0.5241, + "step": 7670 + }, + { + "epoch": 0.6, + "grad_norm": 1.2056543589290634, + "learning_rate": 7.434671530010839e-06, + "loss": 0.5559, + "step": 7671 + }, + { + "epoch": 0.6, + "grad_norm": 1.2776988524557864, + "learning_rate": 7.4322430251838674e-06, + "loss": 0.4768, + "step": 7672 + }, + { + "epoch": 0.6, + "grad_norm": 1.2197474286239363, + "learning_rate": 7.4298146824726445e-06, + "loss": 0.5505, + "step": 7673 + }, + { + "epoch": 0.6, + "grad_norm": 1.1540999656068396, + "learning_rate": 7.427386502030478e-06, + "loss": 0.4792, + "step": 7674 + }, + { + "epoch": 0.6, + "grad_norm": 1.1754700004776857, + "learning_rate": 7.424958484010675e-06, + "loss": 0.5453, + "step": 7675 + }, + { + "epoch": 0.6, + "grad_norm": 1.2249731956196317, + "learning_rate": 7.42253062856653e-06, + "loss": 0.526, + "step": 7676 + }, + { + "epoch": 0.6, + "grad_norm": 1.231689816069384, + "learning_rate": 7.420102935851324e-06, + "loss": 0.5738, + "step": 7677 + }, + { + "epoch": 0.6, + "grad_norm": 1.189048861251189, + "learning_rate": 7.417675406018332e-06, + "loss": 0.5025, + "step": 7678 + }, + { + "epoch": 0.6, + "grad_norm": 1.143413792375082, + "learning_rate": 7.4152480392208105e-06, + "loss": 0.5292, + "step": 7679 + }, + { + "epoch": 0.6, + "grad_norm": 1.170508032483189, + "learning_rate": 7.412820835612016e-06, + "loss": 0.4964, + "step": 7680 + }, + { + "epoch": 0.6, + "grad_norm": 1.1980959329676009, + "learning_rate": 7.410393795345193e-06, + "loss": 0.5273, + "step": 7681 + }, + { + "epoch": 0.6, + "grad_norm": 1.1215090630344111, + "learning_rate": 7.407966918573568e-06, + "loss": 0.5034, + "step": 7682 + }, + { + "epoch": 0.6, + "grad_norm": 1.1825853338763397, + "learning_rate": 7.40554020545037e-06, + "loss": 0.5244, + "step": 7683 + }, + { + "epoch": 0.6, + "grad_norm": 1.1462150920348926, + "learning_rate": 7.403113656128799e-06, + "loss": 0.5359, + "step": 7684 + }, + { + "epoch": 0.6, + "grad_norm": 1.1330780770317208, + "learning_rate": 7.400687270762061e-06, + "loss": 0.5165, + "step": 7685 + }, + { + "epoch": 0.6, + "grad_norm": 1.2432446569791638, + "learning_rate": 7.398261049503348e-06, + "loss": 0.5654, + "step": 7686 + }, + { + "epoch": 0.6, + "grad_norm": 1.188720678054079, + "learning_rate": 7.395834992505837e-06, + "loss": 0.5894, + "step": 7687 + }, + { + "epoch": 0.6, + "grad_norm": 1.203485707298391, + "learning_rate": 7.393409099922703e-06, + "loss": 0.5521, + "step": 7688 + }, + { + "epoch": 0.6, + "grad_norm": 1.3112134986211703, + "learning_rate": 7.390983371907095e-06, + "loss": 0.6046, + "step": 7689 + }, + { + "epoch": 0.6, + "grad_norm": 1.3880458016025612, + "learning_rate": 7.388557808612169e-06, + "loss": 0.6076, + "step": 7690 + }, + { + "epoch": 0.6, + "grad_norm": 1.1220474174427975, + "learning_rate": 7.3861324101910635e-06, + "loss": 0.5573, + "step": 7691 + }, + { + "epoch": 0.6, + "grad_norm": 1.2300879964132578, + "learning_rate": 7.383707176796903e-06, + "loss": 0.5501, + "step": 7692 + }, + { + "epoch": 0.6, + "grad_norm": 1.1069855545293743, + "learning_rate": 7.38128210858281e-06, + "loss": 0.5312, + "step": 7693 + }, + { + "epoch": 0.6, + "grad_norm": 1.0154173418642058, + "learning_rate": 7.378857205701885e-06, + "loss": 0.4921, + "step": 7694 + }, + { + "epoch": 0.6, + "grad_norm": 1.230866092529174, + "learning_rate": 7.376432468307227e-06, + "loss": 0.5444, + "step": 7695 + }, + { + "epoch": 0.6, + "grad_norm": 1.1537083191952524, + "learning_rate": 7.3740078965519266e-06, + "loss": 0.5298, + "step": 7696 + }, + { + "epoch": 0.6, + "grad_norm": 1.1363854198101881, + "learning_rate": 7.371583490589053e-06, + "loss": 0.5462, + "step": 7697 + }, + { + "epoch": 0.6, + "grad_norm": 1.105611569510098, + "learning_rate": 7.369159250571678e-06, + "loss": 0.5163, + "step": 7698 + }, + { + "epoch": 0.6, + "grad_norm": 1.089266661850344, + "learning_rate": 7.3667351766528505e-06, + "loss": 0.5066, + "step": 7699 + }, + { + "epoch": 0.6, + "grad_norm": 1.076758140205487, + "learning_rate": 7.364311268985616e-06, + "loss": 0.5309, + "step": 7700 + }, + { + "epoch": 0.6, + "grad_norm": 1.1024403422297548, + "learning_rate": 7.36188752772301e-06, + "loss": 0.4984, + "step": 7701 + }, + { + "epoch": 0.6, + "grad_norm": 1.2113851396661608, + "learning_rate": 7.359463953018056e-06, + "loss": 0.4815, + "step": 7702 + }, + { + "epoch": 0.6, + "grad_norm": 1.2617883810664767, + "learning_rate": 7.357040545023768e-06, + "loss": 0.5286, + "step": 7703 + }, + { + "epoch": 0.6, + "grad_norm": 1.1882840629910512, + "learning_rate": 7.354617303893143e-06, + "loss": 0.5431, + "step": 7704 + }, + { + "epoch": 0.6, + "grad_norm": 1.17086377064702, + "learning_rate": 7.352194229779177e-06, + "loss": 0.5774, + "step": 7705 + }, + { + "epoch": 0.6, + "grad_norm": 1.2108643048222572, + "learning_rate": 7.349771322834851e-06, + "loss": 0.5097, + "step": 7706 + }, + { + "epoch": 0.6, + "grad_norm": 1.096815445633583, + "learning_rate": 7.347348583213133e-06, + "loss": 0.5229, + "step": 7707 + }, + { + "epoch": 0.6, + "grad_norm": 1.146123046084517, + "learning_rate": 7.344926011066985e-06, + "loss": 0.4909, + "step": 7708 + }, + { + "epoch": 0.6, + "grad_norm": 1.1098479216587311, + "learning_rate": 7.34250360654936e-06, + "loss": 0.5004, + "step": 7709 + }, + { + "epoch": 0.6, + "grad_norm": 1.1521967907317086, + "learning_rate": 7.3400813698131905e-06, + "loss": 0.587, + "step": 7710 + }, + { + "epoch": 0.6, + "grad_norm": 1.1912626086085143, + "learning_rate": 7.337659301011408e-06, + "loss": 0.5536, + "step": 7711 + }, + { + "epoch": 0.6, + "grad_norm": 1.2622791380364655, + "learning_rate": 7.335237400296927e-06, + "loss": 0.5728, + "step": 7712 + }, + { + "epoch": 0.6, + "grad_norm": 1.1782423350543982, + "learning_rate": 7.332815667822659e-06, + "loss": 0.6221, + "step": 7713 + }, + { + "epoch": 0.6, + "grad_norm": 1.2107529530336583, + "learning_rate": 7.3303941037415024e-06, + "loss": 0.5269, + "step": 7714 + }, + { + "epoch": 0.6, + "grad_norm": 1.2087518964592812, + "learning_rate": 7.327972708206334e-06, + "loss": 0.5397, + "step": 7715 + }, + { + "epoch": 0.6, + "grad_norm": 1.237342936249032, + "learning_rate": 7.325551481370038e-06, + "loss": 0.5233, + "step": 7716 + }, + { + "epoch": 0.6, + "grad_norm": 1.168068769886466, + "learning_rate": 7.323130423385473e-06, + "loss": 0.5806, + "step": 7717 + }, + { + "epoch": 0.6, + "grad_norm": 1.1706644545065703, + "learning_rate": 7.320709534405494e-06, + "loss": 0.4631, + "step": 7718 + }, + { + "epoch": 0.6, + "grad_norm": 1.2245851845914941, + "learning_rate": 7.318288814582951e-06, + "loss": 0.506, + "step": 7719 + }, + { + "epoch": 0.6, + "grad_norm": 1.1291926987428518, + "learning_rate": 7.315868264070667e-06, + "loss": 0.5415, + "step": 7720 + }, + { + "epoch": 0.6, + "grad_norm": 1.1108798713598735, + "learning_rate": 7.313447883021469e-06, + "loss": 0.5213, + "step": 7721 + }, + { + "epoch": 0.6, + "grad_norm": 1.2668293532667123, + "learning_rate": 7.3110276715881645e-06, + "loss": 0.5411, + "step": 7722 + }, + { + "epoch": 0.6, + "grad_norm": 1.316964277401386, + "learning_rate": 7.308607629923557e-06, + "loss": 0.5327, + "step": 7723 + }, + { + "epoch": 0.6, + "grad_norm": 1.0794143706686312, + "learning_rate": 7.3061877581804395e-06, + "loss": 0.563, + "step": 7724 + }, + { + "epoch": 0.6, + "grad_norm": 1.2034152783066068, + "learning_rate": 7.303768056511583e-06, + "loss": 0.5531, + "step": 7725 + }, + { + "epoch": 0.6, + "grad_norm": 1.272269479747504, + "learning_rate": 7.301348525069762e-06, + "loss": 0.5658, + "step": 7726 + }, + { + "epoch": 0.6, + "grad_norm": 1.2227362463377844, + "learning_rate": 7.29892916400773e-06, + "loss": 0.49, + "step": 7727 + }, + { + "epoch": 0.6, + "grad_norm": 1.136811504971153, + "learning_rate": 7.296509973478235e-06, + "loss": 0.4989, + "step": 7728 + }, + { + "epoch": 0.6, + "grad_norm": 1.2317801616979487, + "learning_rate": 7.2940909536340184e-06, + "loss": 0.5197, + "step": 7729 + }, + { + "epoch": 0.6, + "grad_norm": 1.1958420521665303, + "learning_rate": 7.291672104627796e-06, + "loss": 0.531, + "step": 7730 + }, + { + "epoch": 0.6, + "grad_norm": 1.1883224349750479, + "learning_rate": 7.289253426612288e-06, + "loss": 0.5429, + "step": 7731 + }, + { + "epoch": 0.6, + "grad_norm": 1.1940057131121007, + "learning_rate": 7.286834919740195e-06, + "loss": 0.5843, + "step": 7732 + }, + { + "epoch": 0.6, + "grad_norm": 1.1652537657262145, + "learning_rate": 7.2844165841642135e-06, + "loss": 0.5263, + "step": 7733 + }, + { + "epoch": 0.6, + "grad_norm": 1.1133999945238846, + "learning_rate": 7.281998420037026e-06, + "loss": 0.5187, + "step": 7734 + }, + { + "epoch": 0.6, + "grad_norm": 1.178772931811347, + "learning_rate": 7.279580427511297e-06, + "loss": 0.5398, + "step": 7735 + }, + { + "epoch": 0.6, + "grad_norm": 1.156470200474254, + "learning_rate": 7.277162606739694e-06, + "loss": 0.4845, + "step": 7736 + }, + { + "epoch": 0.6, + "grad_norm": 1.2183695957529324, + "learning_rate": 7.274744957874862e-06, + "loss": 0.5565, + "step": 7737 + }, + { + "epoch": 0.6, + "grad_norm": 1.1619184064631618, + "learning_rate": 7.27232748106944e-06, + "loss": 0.4968, + "step": 7738 + }, + { + "epoch": 0.6, + "grad_norm": 1.2777065030320225, + "learning_rate": 7.269910176476062e-06, + "loss": 0.5697, + "step": 7739 + }, + { + "epoch": 0.6, + "grad_norm": 1.0787274985325965, + "learning_rate": 7.267493044247338e-06, + "loss": 0.5213, + "step": 7740 + }, + { + "epoch": 0.6, + "grad_norm": 1.240664478125496, + "learning_rate": 7.265076084535874e-06, + "loss": 0.5802, + "step": 7741 + }, + { + "epoch": 0.6, + "grad_norm": 1.1855375738894567, + "learning_rate": 7.262659297494266e-06, + "loss": 0.497, + "step": 7742 + }, + { + "epoch": 0.6, + "grad_norm": 1.1670912696562183, + "learning_rate": 7.2602426832751005e-06, + "loss": 0.4886, + "step": 7743 + }, + { + "epoch": 0.6, + "grad_norm": 1.2025837857565378, + "learning_rate": 7.2578262420309534e-06, + "loss": 0.5525, + "step": 7744 + }, + { + "epoch": 0.6, + "grad_norm": 1.168066371550638, + "learning_rate": 7.25540997391438e-06, + "loss": 0.5167, + "step": 7745 + }, + { + "epoch": 0.6, + "grad_norm": 1.1812902514346169, + "learning_rate": 7.252993879077936e-06, + "loss": 0.5359, + "step": 7746 + }, + { + "epoch": 0.6, + "grad_norm": 1.1309621497380071, + "learning_rate": 7.250577957674159e-06, + "loss": 0.4906, + "step": 7747 + }, + { + "epoch": 0.6, + "grad_norm": 1.1578495714685226, + "learning_rate": 7.24816220985558e-06, + "loss": 0.4922, + "step": 7748 + }, + { + "epoch": 0.6, + "grad_norm": 1.1195265286033935, + "learning_rate": 7.245746635774724e-06, + "loss": 0.4856, + "step": 7749 + }, + { + "epoch": 0.6, + "grad_norm": 1.2022404577392567, + "learning_rate": 7.2433312355840916e-06, + "loss": 0.5518, + "step": 7750 + }, + { + "epoch": 0.6, + "grad_norm": 1.1430677098621624, + "learning_rate": 7.2409160094361774e-06, + "loss": 0.5409, + "step": 7751 + }, + { + "epoch": 0.6, + "grad_norm": 1.2245437142079847, + "learning_rate": 7.2385009574834705e-06, + "loss": 0.5524, + "step": 7752 + }, + { + "epoch": 0.6, + "grad_norm": 1.120899196849831, + "learning_rate": 7.236086079878446e-06, + "loss": 0.4832, + "step": 7753 + }, + { + "epoch": 0.6, + "grad_norm": 1.1531149142684882, + "learning_rate": 7.2336713767735725e-06, + "loss": 0.5234, + "step": 7754 + }, + { + "epoch": 0.6, + "grad_norm": 1.161670095300928, + "learning_rate": 7.231256848321293e-06, + "loss": 0.4833, + "step": 7755 + }, + { + "epoch": 0.6, + "grad_norm": 1.1041531771909487, + "learning_rate": 7.228842494674053e-06, + "loss": 0.4979, + "step": 7756 + }, + { + "epoch": 0.6, + "grad_norm": 1.1242680287644997, + "learning_rate": 7.226428315984283e-06, + "loss": 0.5051, + "step": 7757 + }, + { + "epoch": 0.6, + "grad_norm": 1.2279510413670842, + "learning_rate": 7.224014312404404e-06, + "loss": 0.5773, + "step": 7758 + }, + { + "epoch": 0.6, + "grad_norm": 1.1478084476273744, + "learning_rate": 7.221600484086826e-06, + "loss": 0.5261, + "step": 7759 + }, + { + "epoch": 0.6, + "grad_norm": 1.1540163993562722, + "learning_rate": 7.219186831183944e-06, + "loss": 0.5333, + "step": 7760 + }, + { + "epoch": 0.6, + "grad_norm": 1.2380839771328047, + "learning_rate": 7.216773353848141e-06, + "loss": 0.522, + "step": 7761 + }, + { + "epoch": 0.6, + "grad_norm": 1.1390615748439636, + "learning_rate": 7.214360052231797e-06, + "loss": 0.4815, + "step": 7762 + }, + { + "epoch": 0.6, + "grad_norm": 1.103024101505859, + "learning_rate": 7.211946926487274e-06, + "loss": 0.5089, + "step": 7763 + }, + { + "epoch": 0.6, + "grad_norm": 1.0285385784133456, + "learning_rate": 7.209533976766928e-06, + "loss": 0.5471, + "step": 7764 + }, + { + "epoch": 0.6, + "grad_norm": 1.1227130957794027, + "learning_rate": 7.207121203223102e-06, + "loss": 0.5557, + "step": 7765 + }, + { + "epoch": 0.6, + "grad_norm": 1.1430226561636287, + "learning_rate": 7.204708606008119e-06, + "loss": 0.5567, + "step": 7766 + }, + { + "epoch": 0.6, + "grad_norm": 1.1506392982502094, + "learning_rate": 7.202296185274302e-06, + "loss": 0.4934, + "step": 7767 + }, + { + "epoch": 0.6, + "grad_norm": 1.0627155085395406, + "learning_rate": 7.199883941173963e-06, + "loss": 0.5273, + "step": 7768 + }, + { + "epoch": 0.6, + "grad_norm": 1.1614702791474936, + "learning_rate": 7.197471873859399e-06, + "loss": 0.5168, + "step": 7769 + }, + { + "epoch": 0.6, + "grad_norm": 1.2146122457335764, + "learning_rate": 7.195059983482897e-06, + "loss": 0.5171, + "step": 7770 + }, + { + "epoch": 0.6, + "grad_norm": 1.1708842349046111, + "learning_rate": 7.192648270196726e-06, + "loss": 0.5207, + "step": 7771 + }, + { + "epoch": 0.6, + "grad_norm": 1.1807842083074005, + "learning_rate": 7.1902367341531536e-06, + "loss": 0.5832, + "step": 7772 + }, + { + "epoch": 0.6, + "grad_norm": 1.245341202274705, + "learning_rate": 7.187825375504431e-06, + "loss": 0.542, + "step": 7773 + }, + { + "epoch": 0.6, + "grad_norm": 1.217804811221884, + "learning_rate": 7.185414194402805e-06, + "loss": 0.569, + "step": 7774 + }, + { + "epoch": 0.6, + "grad_norm": 1.2187131729430711, + "learning_rate": 7.183003191000505e-06, + "loss": 0.5518, + "step": 7775 + }, + { + "epoch": 0.6, + "grad_norm": 1.2335917728304344, + "learning_rate": 7.180592365449742e-06, + "loss": 0.5684, + "step": 7776 + }, + { + "epoch": 0.6, + "grad_norm": 1.1324912306254484, + "learning_rate": 7.17818171790273e-06, + "loss": 0.4846, + "step": 7777 + }, + { + "epoch": 0.6, + "grad_norm": 1.1520112673321037, + "learning_rate": 7.175771248511664e-06, + "loss": 0.5322, + "step": 7778 + }, + { + "epoch": 0.6, + "grad_norm": 1.2145664597784722, + "learning_rate": 7.173360957428732e-06, + "loss": 0.5466, + "step": 7779 + }, + { + "epoch": 0.6, + "grad_norm": 1.2196088845604676, + "learning_rate": 7.170950844806109e-06, + "loss": 0.5069, + "step": 7780 + }, + { + "epoch": 0.6, + "grad_norm": 1.1349496610124286, + "learning_rate": 7.168540910795951e-06, + "loss": 0.5332, + "step": 7781 + }, + { + "epoch": 0.6, + "grad_norm": 1.1940255311475914, + "learning_rate": 7.166131155550414e-06, + "loss": 0.5538, + "step": 7782 + }, + { + "epoch": 0.6, + "grad_norm": 1.190497279888636, + "learning_rate": 7.163721579221638e-06, + "loss": 0.5228, + "step": 7783 + }, + { + "epoch": 0.6, + "grad_norm": 1.0679824415220855, + "learning_rate": 7.161312181961754e-06, + "loss": 0.4807, + "step": 7784 + }, + { + "epoch": 0.6, + "grad_norm": 1.1247656366160539, + "learning_rate": 7.158902963922879e-06, + "loss": 0.5591, + "step": 7785 + }, + { + "epoch": 0.6, + "grad_norm": 1.1692783834076288, + "learning_rate": 7.156493925257114e-06, + "loss": 0.5372, + "step": 7786 + }, + { + "epoch": 0.6, + "grad_norm": 1.2981125315920066, + "learning_rate": 7.154085066116558e-06, + "loss": 0.5805, + "step": 7787 + }, + { + "epoch": 0.6, + "grad_norm": 1.2085272151137993, + "learning_rate": 7.1516763866532955e-06, + "loss": 0.531, + "step": 7788 + }, + { + "epoch": 0.6, + "grad_norm": 1.0897060840576522, + "learning_rate": 7.1492678870194e-06, + "loss": 0.4923, + "step": 7789 + }, + { + "epoch": 0.6, + "grad_norm": 1.1414289384763234, + "learning_rate": 7.146859567366933e-06, + "loss": 0.4814, + "step": 7790 + }, + { + "epoch": 0.6, + "grad_norm": 1.1963225441743155, + "learning_rate": 7.144451427847937e-06, + "loss": 0.5589, + "step": 7791 + }, + { + "epoch": 0.6, + "grad_norm": 1.1827548739909985, + "learning_rate": 7.142043468614455e-06, + "loss": 0.5394, + "step": 7792 + }, + { + "epoch": 0.6, + "grad_norm": 1.1961650924286023, + "learning_rate": 7.139635689818514e-06, + "loss": 0.5794, + "step": 7793 + }, + { + "epoch": 0.6, + "grad_norm": 1.1906948279382121, + "learning_rate": 7.137228091612132e-06, + "loss": 0.5365, + "step": 7794 + }, + { + "epoch": 0.6, + "grad_norm": 1.090560677279326, + "learning_rate": 7.134820674147313e-06, + "loss": 0.5128, + "step": 7795 + }, + { + "epoch": 0.6, + "grad_norm": 1.149568792022806, + "learning_rate": 7.132413437576043e-06, + "loss": 0.5389, + "step": 7796 + }, + { + "epoch": 0.6, + "grad_norm": 1.202005484763658, + "learning_rate": 7.130006382050305e-06, + "loss": 0.4747, + "step": 7797 + }, + { + "epoch": 0.6, + "grad_norm": 1.1965656069866897, + "learning_rate": 7.127599507722074e-06, + "loss": 0.5338, + "step": 7798 + }, + { + "epoch": 0.61, + "grad_norm": 1.3071009077536964, + "learning_rate": 7.1251928147433065e-06, + "loss": 0.5717, + "step": 7799 + }, + { + "epoch": 0.61, + "grad_norm": 1.0713810842072236, + "learning_rate": 7.122786303265952e-06, + "loss": 0.4685, + "step": 7800 + }, + { + "epoch": 0.61, + "grad_norm": 1.2521506405139966, + "learning_rate": 7.1203799734419375e-06, + "loss": 0.5251, + "step": 7801 + }, + { + "epoch": 0.61, + "grad_norm": 1.140593907180692, + "learning_rate": 7.117973825423193e-06, + "loss": 0.5511, + "step": 7802 + }, + { + "epoch": 0.61, + "grad_norm": 1.2983479695031779, + "learning_rate": 7.115567859361629e-06, + "loss": 0.5779, + "step": 7803 + }, + { + "epoch": 0.61, + "grad_norm": 1.2454885131048963, + "learning_rate": 7.1131620754091505e-06, + "loss": 0.5042, + "step": 7804 + }, + { + "epoch": 0.61, + "grad_norm": 1.321252928110738, + "learning_rate": 7.110756473717646e-06, + "loss": 0.5228, + "step": 7805 + }, + { + "epoch": 0.61, + "grad_norm": 1.13213143757669, + "learning_rate": 7.108351054438988e-06, + "loss": 0.5296, + "step": 7806 + }, + { + "epoch": 0.61, + "grad_norm": 1.249380101035593, + "learning_rate": 7.105945817725046e-06, + "loss": 0.4867, + "step": 7807 + }, + { + "epoch": 0.61, + "grad_norm": 1.1128109658071885, + "learning_rate": 7.103540763727678e-06, + "loss": 0.5154, + "step": 7808 + }, + { + "epoch": 0.61, + "grad_norm": 1.1646007343191, + "learning_rate": 7.101135892598725e-06, + "loss": 0.5527, + "step": 7809 + }, + { + "epoch": 0.61, + "grad_norm": 1.2326544347269293, + "learning_rate": 7.098731204490023e-06, + "loss": 0.5327, + "step": 7810 + }, + { + "epoch": 0.61, + "grad_norm": 1.1507014063673973, + "learning_rate": 7.096326699553384e-06, + "loss": 0.5387, + "step": 7811 + }, + { + "epoch": 0.61, + "grad_norm": 1.233105646630224, + "learning_rate": 7.093922377940622e-06, + "loss": 0.5323, + "step": 7812 + }, + { + "epoch": 0.61, + "grad_norm": 1.1774480417859856, + "learning_rate": 7.091518239803533e-06, + "loss": 0.5185, + "step": 7813 + }, + { + "epoch": 0.61, + "grad_norm": 1.2019897653712068, + "learning_rate": 7.089114285293906e-06, + "loss": 0.5744, + "step": 7814 + }, + { + "epoch": 0.61, + "grad_norm": 1.1609124182228903, + "learning_rate": 7.086710514563513e-06, + "loss": 0.5477, + "step": 7815 + }, + { + "epoch": 0.61, + "grad_norm": 1.1763191386871017, + "learning_rate": 7.084306927764113e-06, + "loss": 0.5639, + "step": 7816 + }, + { + "epoch": 0.61, + "grad_norm": 1.2453669518181323, + "learning_rate": 7.08190352504746e-06, + "loss": 0.545, + "step": 7817 + }, + { + "epoch": 0.61, + "grad_norm": 1.2543348013799753, + "learning_rate": 7.079500306565291e-06, + "loss": 0.5758, + "step": 7818 + }, + { + "epoch": 0.61, + "grad_norm": 1.1122521177868907, + "learning_rate": 7.07709727246934e-06, + "loss": 0.5454, + "step": 7819 + }, + { + "epoch": 0.61, + "grad_norm": 1.2221274405391953, + "learning_rate": 7.074694422911315e-06, + "loss": 0.5464, + "step": 7820 + }, + { + "epoch": 0.61, + "grad_norm": 1.19652939231342, + "learning_rate": 7.072291758042926e-06, + "loss": 0.5225, + "step": 7821 + }, + { + "epoch": 0.61, + "grad_norm": 1.1074683769998526, + "learning_rate": 7.0698892780158625e-06, + "loss": 0.5044, + "step": 7822 + }, + { + "epoch": 0.61, + "grad_norm": 1.1954839246906357, + "learning_rate": 7.067486982981804e-06, + "loss": 0.5531, + "step": 7823 + }, + { + "epoch": 0.61, + "grad_norm": 1.0720141173187567, + "learning_rate": 7.065084873092425e-06, + "loss": 0.517, + "step": 7824 + }, + { + "epoch": 0.61, + "grad_norm": 1.2354404824562912, + "learning_rate": 7.062682948499377e-06, + "loss": 0.5565, + "step": 7825 + }, + { + "epoch": 0.61, + "grad_norm": 1.254465234055037, + "learning_rate": 7.060281209354315e-06, + "loss": 0.5384, + "step": 7826 + }, + { + "epoch": 0.61, + "grad_norm": 1.2194844502688962, + "learning_rate": 7.057879655808862e-06, + "loss": 0.5459, + "step": 7827 + }, + { + "epoch": 0.61, + "grad_norm": 1.2146731437774914, + "learning_rate": 7.055478288014646e-06, + "loss": 0.5148, + "step": 7828 + }, + { + "epoch": 0.61, + "grad_norm": 1.1701081753305558, + "learning_rate": 7.053077106123278e-06, + "loss": 0.5894, + "step": 7829 + }, + { + "epoch": 0.61, + "grad_norm": 1.2052994931275567, + "learning_rate": 7.0506761102863565e-06, + "loss": 0.5449, + "step": 7830 + }, + { + "epoch": 0.61, + "grad_norm": 1.196580999136246, + "learning_rate": 7.048275300655472e-06, + "loss": 0.521, + "step": 7831 + }, + { + "epoch": 0.61, + "grad_norm": 1.13749348565217, + "learning_rate": 7.0458746773821915e-06, + "loss": 0.514, + "step": 7832 + }, + { + "epoch": 0.61, + "grad_norm": 1.1666929491806517, + "learning_rate": 7.043474240618086e-06, + "loss": 0.5396, + "step": 7833 + }, + { + "epoch": 0.61, + "grad_norm": 1.0910625683948953, + "learning_rate": 7.0410739905147065e-06, + "loss": 0.5458, + "step": 7834 + }, + { + "epoch": 0.61, + "grad_norm": 1.204570545897053, + "learning_rate": 7.03867392722359e-06, + "loss": 0.5585, + "step": 7835 + }, + { + "epoch": 0.61, + "grad_norm": 1.2072295979608385, + "learning_rate": 7.0362740508962705e-06, + "loss": 0.541, + "step": 7836 + }, + { + "epoch": 0.61, + "grad_norm": 1.0771549670798806, + "learning_rate": 7.033874361684256e-06, + "loss": 0.51, + "step": 7837 + }, + { + "epoch": 0.61, + "grad_norm": 1.1567993405588204, + "learning_rate": 7.031474859739058e-06, + "loss": 0.5428, + "step": 7838 + }, + { + "epoch": 0.61, + "grad_norm": 1.1498365638530126, + "learning_rate": 7.02907554521217e-06, + "loss": 0.5344, + "step": 7839 + }, + { + "epoch": 0.61, + "grad_norm": 1.0391374718715964, + "learning_rate": 7.026676418255068e-06, + "loss": 0.5073, + "step": 7840 + }, + { + "epoch": 0.61, + "grad_norm": 1.3187526540141947, + "learning_rate": 7.024277479019227e-06, + "loss": 0.6021, + "step": 7841 + }, + { + "epoch": 0.61, + "grad_norm": 1.2710415351707958, + "learning_rate": 7.021878727656097e-06, + "loss": 0.5852, + "step": 7842 + }, + { + "epoch": 0.61, + "grad_norm": 1.2183561422262938, + "learning_rate": 7.0194801643171324e-06, + "loss": 0.5688, + "step": 7843 + }, + { + "epoch": 0.61, + "grad_norm": 1.0695383655431216, + "learning_rate": 7.017081789153759e-06, + "loss": 0.5091, + "step": 7844 + }, + { + "epoch": 0.61, + "grad_norm": 1.1687122644870616, + "learning_rate": 7.014683602317402e-06, + "loss": 0.5178, + "step": 7845 + }, + { + "epoch": 0.61, + "grad_norm": 1.0709712653379195, + "learning_rate": 7.012285603959477e-06, + "loss": 0.4918, + "step": 7846 + }, + { + "epoch": 0.61, + "grad_norm": 1.1266700217142505, + "learning_rate": 7.009887794231371e-06, + "loss": 0.5317, + "step": 7847 + }, + { + "epoch": 0.61, + "grad_norm": 1.191408151093514, + "learning_rate": 7.0074901732844795e-06, + "loss": 0.5349, + "step": 7848 + }, + { + "epoch": 0.61, + "grad_norm": 1.0818673570915007, + "learning_rate": 7.005092741270168e-06, + "loss": 0.4733, + "step": 7849 + }, + { + "epoch": 0.61, + "grad_norm": 1.1800136494251199, + "learning_rate": 7.0026954983398045e-06, + "loss": 0.5287, + "step": 7850 + }, + { + "epoch": 0.61, + "grad_norm": 1.2905621677279835, + "learning_rate": 7.0002984446447416e-06, + "loss": 0.5673, + "step": 7851 + }, + { + "epoch": 0.61, + "grad_norm": 1.1364672405524396, + "learning_rate": 6.997901580336312e-06, + "loss": 0.5333, + "step": 7852 + }, + { + "epoch": 0.61, + "grad_norm": 1.1678508584222989, + "learning_rate": 6.995504905565844e-06, + "loss": 0.5066, + "step": 7853 + }, + { + "epoch": 0.61, + "grad_norm": 1.0731681541337377, + "learning_rate": 6.993108420484652e-06, + "loss": 0.4843, + "step": 7854 + }, + { + "epoch": 0.61, + "grad_norm": 1.1146248723545837, + "learning_rate": 6.990712125244039e-06, + "loss": 0.5617, + "step": 7855 + }, + { + "epoch": 0.61, + "grad_norm": 1.2610995541872534, + "learning_rate": 6.9883160199952996e-06, + "loss": 0.5487, + "step": 7856 + }, + { + "epoch": 0.61, + "grad_norm": 1.1544294200517597, + "learning_rate": 6.985920104889703e-06, + "loss": 0.5068, + "step": 7857 + }, + { + "epoch": 0.61, + "grad_norm": 1.4344190669718466, + "learning_rate": 6.983524380078523e-06, + "loss": 0.5083, + "step": 7858 + }, + { + "epoch": 0.61, + "grad_norm": 1.167468675197255, + "learning_rate": 6.9811288457130115e-06, + "loss": 0.5143, + "step": 7859 + }, + { + "epoch": 0.61, + "grad_norm": 1.2997469985965413, + "learning_rate": 6.9787335019444105e-06, + "loss": 0.5593, + "step": 7860 + }, + { + "epoch": 0.61, + "grad_norm": 1.2287374760095242, + "learning_rate": 6.976338348923955e-06, + "loss": 0.5573, + "step": 7861 + }, + { + "epoch": 0.61, + "grad_norm": 1.194006911189079, + "learning_rate": 6.973943386802855e-06, + "loss": 0.4866, + "step": 7862 + }, + { + "epoch": 0.61, + "grad_norm": 1.1653919181246282, + "learning_rate": 6.971548615732324e-06, + "loss": 0.525, + "step": 7863 + }, + { + "epoch": 0.61, + "grad_norm": 1.1742388979261293, + "learning_rate": 6.9691540358635504e-06, + "loss": 0.5382, + "step": 7864 + }, + { + "epoch": 0.61, + "grad_norm": 1.177622320054089, + "learning_rate": 6.96675964734772e-06, + "loss": 0.516, + "step": 7865 + }, + { + "epoch": 0.61, + "grad_norm": 1.2093866076639326, + "learning_rate": 6.964365450336008e-06, + "loss": 0.5821, + "step": 7866 + }, + { + "epoch": 0.61, + "grad_norm": 1.100729336404497, + "learning_rate": 6.961971444979563e-06, + "loss": 0.486, + "step": 7867 + }, + { + "epoch": 0.61, + "grad_norm": 1.0935850291497822, + "learning_rate": 6.959577631429535e-06, + "loss": 0.5151, + "step": 7868 + }, + { + "epoch": 0.61, + "grad_norm": 1.2244038629951342, + "learning_rate": 6.9571840098370566e-06, + "loss": 0.5779, + "step": 7869 + }, + { + "epoch": 0.61, + "grad_norm": 1.031875478465976, + "learning_rate": 6.95479058035325e-06, + "loss": 0.4658, + "step": 7870 + }, + { + "epoch": 0.61, + "grad_norm": 1.0577686092224674, + "learning_rate": 6.952397343129232e-06, + "loss": 0.4849, + "step": 7871 + }, + { + "epoch": 0.61, + "grad_norm": 1.1623036967423783, + "learning_rate": 6.950004298316086e-06, + "loss": 0.5704, + "step": 7872 + }, + { + "epoch": 0.61, + "grad_norm": 1.1364622055999398, + "learning_rate": 6.947611446064908e-06, + "loss": 0.5005, + "step": 7873 + }, + { + "epoch": 0.61, + "grad_norm": 1.1683675027894271, + "learning_rate": 6.945218786526764e-06, + "loss": 0.5791, + "step": 7874 + }, + { + "epoch": 0.61, + "grad_norm": 1.1427361794726965, + "learning_rate": 6.94282631985272e-06, + "loss": 0.5238, + "step": 7875 + }, + { + "epoch": 0.61, + "grad_norm": 1.2391517058655634, + "learning_rate": 6.940434046193824e-06, + "loss": 0.5186, + "step": 7876 + }, + { + "epoch": 0.61, + "grad_norm": 1.1726582262199163, + "learning_rate": 6.9380419657011146e-06, + "loss": 0.5252, + "step": 7877 + }, + { + "epoch": 0.61, + "grad_norm": 1.2253517930283762, + "learning_rate": 6.9356500785256135e-06, + "loss": 0.55, + "step": 7878 + }, + { + "epoch": 0.61, + "grad_norm": 1.1347181930740233, + "learning_rate": 6.933258384818329e-06, + "loss": 0.4905, + "step": 7879 + }, + { + "epoch": 0.61, + "grad_norm": 1.2204177400778107, + "learning_rate": 6.930866884730265e-06, + "loss": 0.5451, + "step": 7880 + }, + { + "epoch": 0.61, + "grad_norm": 1.1676987556021625, + "learning_rate": 6.92847557841241e-06, + "loss": 0.5668, + "step": 7881 + }, + { + "epoch": 0.61, + "grad_norm": 1.1790499448575396, + "learning_rate": 6.926084466015744e-06, + "loss": 0.5273, + "step": 7882 + }, + { + "epoch": 0.61, + "grad_norm": 1.1865000781361859, + "learning_rate": 6.92369354769122e-06, + "loss": 0.4774, + "step": 7883 + }, + { + "epoch": 0.61, + "grad_norm": 1.347984915198981, + "learning_rate": 6.921302823589793e-06, + "loss": 0.5311, + "step": 7884 + }, + { + "epoch": 0.61, + "grad_norm": 1.0963504395531058, + "learning_rate": 6.918912293862403e-06, + "loss": 0.4951, + "step": 7885 + }, + { + "epoch": 0.61, + "grad_norm": 1.1679240444647572, + "learning_rate": 6.916521958659977e-06, + "loss": 0.5576, + "step": 7886 + }, + { + "epoch": 0.61, + "grad_norm": 1.154004674806167, + "learning_rate": 6.914131818133432e-06, + "loss": 0.5403, + "step": 7887 + }, + { + "epoch": 0.61, + "grad_norm": 1.1454715966941336, + "learning_rate": 6.911741872433664e-06, + "loss": 0.5142, + "step": 7888 + }, + { + "epoch": 0.61, + "grad_norm": 1.1530003632795518, + "learning_rate": 6.909352121711563e-06, + "loss": 0.5231, + "step": 7889 + }, + { + "epoch": 0.61, + "grad_norm": 1.1927150354099902, + "learning_rate": 6.906962566118009e-06, + "loss": 0.5091, + "step": 7890 + }, + { + "epoch": 0.61, + "grad_norm": 1.2525211658326436, + "learning_rate": 6.904573205803864e-06, + "loss": 0.5635, + "step": 7891 + }, + { + "epoch": 0.61, + "grad_norm": 1.2196444139388871, + "learning_rate": 6.902184040919989e-06, + "loss": 0.5389, + "step": 7892 + }, + { + "epoch": 0.61, + "grad_norm": 1.1768412833466393, + "learning_rate": 6.899795071617214e-06, + "loss": 0.5252, + "step": 7893 + }, + { + "epoch": 0.61, + "grad_norm": 1.0752564567898892, + "learning_rate": 6.897406298046372e-06, + "loss": 0.5161, + "step": 7894 + }, + { + "epoch": 0.61, + "grad_norm": 1.2337644008425996, + "learning_rate": 6.895017720358275e-06, + "loss": 0.5328, + "step": 7895 + }, + { + "epoch": 0.61, + "grad_norm": 1.1659156334717273, + "learning_rate": 6.892629338703731e-06, + "loss": 0.4627, + "step": 7896 + }, + { + "epoch": 0.61, + "grad_norm": 1.114672998879971, + "learning_rate": 6.890241153233532e-06, + "loss": 0.51, + "step": 7897 + }, + { + "epoch": 0.61, + "grad_norm": 1.0345112068413678, + "learning_rate": 6.88785316409845e-06, + "loss": 0.4291, + "step": 7898 + }, + { + "epoch": 0.61, + "grad_norm": 1.2609283995270413, + "learning_rate": 6.885465371449254e-06, + "loss": 0.5425, + "step": 7899 + }, + { + "epoch": 0.61, + "grad_norm": 1.1207460402538525, + "learning_rate": 6.883077775436697e-06, + "loss": 0.5067, + "step": 7900 + }, + { + "epoch": 0.61, + "grad_norm": 1.0903690945125262, + "learning_rate": 6.880690376211522e-06, + "loss": 0.5209, + "step": 7901 + }, + { + "epoch": 0.61, + "grad_norm": 1.1704934686920958, + "learning_rate": 6.878303173924461e-06, + "loss": 0.5338, + "step": 7902 + }, + { + "epoch": 0.61, + "grad_norm": 1.2291753359127904, + "learning_rate": 6.875916168726225e-06, + "loss": 0.5307, + "step": 7903 + }, + { + "epoch": 0.61, + "grad_norm": 1.0942555485341354, + "learning_rate": 6.8735293607675145e-06, + "loss": 0.465, + "step": 7904 + }, + { + "epoch": 0.61, + "grad_norm": 1.1536306146288118, + "learning_rate": 6.8711427501990295e-06, + "loss": 0.5163, + "step": 7905 + }, + { + "epoch": 0.61, + "grad_norm": 1.1635876909725476, + "learning_rate": 6.8687563371714425e-06, + "loss": 0.5227, + "step": 7906 + }, + { + "epoch": 0.61, + "grad_norm": 1.165310031175236, + "learning_rate": 6.866370121835428e-06, + "loss": 0.4596, + "step": 7907 + }, + { + "epoch": 0.61, + "grad_norm": 1.1714660948382547, + "learning_rate": 6.863984104341634e-06, + "loss": 0.5202, + "step": 7908 + }, + { + "epoch": 0.61, + "grad_norm": 1.1461271025024393, + "learning_rate": 6.861598284840701e-06, + "loss": 0.5232, + "step": 7909 + }, + { + "epoch": 0.61, + "grad_norm": 1.0713591643967781, + "learning_rate": 6.859212663483261e-06, + "loss": 0.5011, + "step": 7910 + }, + { + "epoch": 0.61, + "grad_norm": 1.2168841139379114, + "learning_rate": 6.8568272404199275e-06, + "loss": 0.5276, + "step": 7911 + }, + { + "epoch": 0.61, + "grad_norm": 1.3146629222751425, + "learning_rate": 6.854442015801312e-06, + "loss": 0.5626, + "step": 7912 + }, + { + "epoch": 0.61, + "grad_norm": 1.1516127514582708, + "learning_rate": 6.852056989778001e-06, + "loss": 0.5152, + "step": 7913 + }, + { + "epoch": 0.61, + "grad_norm": 1.1675119176415645, + "learning_rate": 6.849672162500569e-06, + "loss": 0.5379, + "step": 7914 + }, + { + "epoch": 0.61, + "grad_norm": 1.1914884945357567, + "learning_rate": 6.847287534119589e-06, + "loss": 0.574, + "step": 7915 + }, + { + "epoch": 0.61, + "grad_norm": 1.0768226276045214, + "learning_rate": 6.844903104785611e-06, + "loss": 0.5135, + "step": 7916 + }, + { + "epoch": 0.61, + "grad_norm": 1.1621777939089832, + "learning_rate": 6.842518874649182e-06, + "loss": 0.5081, + "step": 7917 + }, + { + "epoch": 0.61, + "grad_norm": 1.0835137522678033, + "learning_rate": 6.840134843860826e-06, + "loss": 0.5088, + "step": 7918 + }, + { + "epoch": 0.61, + "grad_norm": 1.2174607328940494, + "learning_rate": 6.837751012571059e-06, + "loss": 0.5342, + "step": 7919 + }, + { + "epoch": 0.61, + "grad_norm": 1.1238253076628746, + "learning_rate": 6.835367380930384e-06, + "loss": 0.531, + "step": 7920 + }, + { + "epoch": 0.61, + "grad_norm": 1.2086033629002453, + "learning_rate": 6.832983949089293e-06, + "loss": 0.5187, + "step": 7921 + }, + { + "epoch": 0.61, + "grad_norm": 1.1637631741934549, + "learning_rate": 6.83060071719827e-06, + "loss": 0.5124, + "step": 7922 + }, + { + "epoch": 0.61, + "grad_norm": 1.31596208544439, + "learning_rate": 6.828217685407774e-06, + "loss": 0.5818, + "step": 7923 + }, + { + "epoch": 0.61, + "grad_norm": 1.1935038627398995, + "learning_rate": 6.825834853868259e-06, + "loss": 0.5797, + "step": 7924 + }, + { + "epoch": 0.61, + "grad_norm": 1.4366428473431188, + "learning_rate": 6.823452222730162e-06, + "loss": 0.5467, + "step": 7925 + }, + { + "epoch": 0.61, + "grad_norm": 1.0911741169592177, + "learning_rate": 6.82106979214392e-06, + "loss": 0.4954, + "step": 7926 + }, + { + "epoch": 0.61, + "grad_norm": 1.1608370443550886, + "learning_rate": 6.8186875622599434e-06, + "loss": 0.5174, + "step": 7927 + }, + { + "epoch": 0.62, + "grad_norm": 1.1283981605418782, + "learning_rate": 6.8163055332286344e-06, + "loss": 0.4814, + "step": 7928 + }, + { + "epoch": 0.62, + "grad_norm": 1.1954016558738603, + "learning_rate": 6.81392370520038e-06, + "loss": 0.4782, + "step": 7929 + }, + { + "epoch": 0.62, + "grad_norm": 1.2694381444975502, + "learning_rate": 6.81154207832556e-06, + "loss": 0.5992, + "step": 7930 + }, + { + "epoch": 0.62, + "grad_norm": 1.1908383376058862, + "learning_rate": 6.809160652754539e-06, + "loss": 0.534, + "step": 7931 + }, + { + "epoch": 0.62, + "grad_norm": 1.288604007447456, + "learning_rate": 6.806779428637671e-06, + "loss": 0.5975, + "step": 7932 + }, + { + "epoch": 0.62, + "grad_norm": 1.227494584205629, + "learning_rate": 6.804398406125295e-06, + "loss": 0.5705, + "step": 7933 + }, + { + "epoch": 0.62, + "grad_norm": 1.2062536407573883, + "learning_rate": 6.802017585367728e-06, + "loss": 0.541, + "step": 7934 + }, + { + "epoch": 0.62, + "grad_norm": 1.229015109314627, + "learning_rate": 6.799636966515293e-06, + "loss": 0.5743, + "step": 7935 + }, + { + "epoch": 0.62, + "grad_norm": 1.2845677010798349, + "learning_rate": 6.797256549718287e-06, + "loss": 0.5599, + "step": 7936 + }, + { + "epoch": 0.62, + "grad_norm": 1.1578203311711954, + "learning_rate": 6.794876335127002e-06, + "loss": 0.5248, + "step": 7937 + }, + { + "epoch": 0.62, + "grad_norm": 1.2289615664730857, + "learning_rate": 6.79249632289171e-06, + "loss": 0.5245, + "step": 7938 + }, + { + "epoch": 0.62, + "grad_norm": 1.2706730329795046, + "learning_rate": 6.790116513162672e-06, + "loss": 0.5255, + "step": 7939 + }, + { + "epoch": 0.62, + "grad_norm": 1.2013442497314895, + "learning_rate": 6.787736906090139e-06, + "loss": 0.5322, + "step": 7940 + }, + { + "epoch": 0.62, + "grad_norm": 1.2509952874312156, + "learning_rate": 6.7853575018243504e-06, + "loss": 0.5498, + "step": 7941 + }, + { + "epoch": 0.62, + "grad_norm": 1.1785243786672281, + "learning_rate": 6.7829783005155235e-06, + "loss": 0.5028, + "step": 7942 + }, + { + "epoch": 0.62, + "grad_norm": 1.2342227890232524, + "learning_rate": 6.780599302313882e-06, + "loss": 0.5354, + "step": 7943 + }, + { + "epoch": 0.62, + "grad_norm": 1.2008684969237855, + "learning_rate": 6.778220507369609e-06, + "loss": 0.5699, + "step": 7944 + }, + { + "epoch": 0.62, + "grad_norm": 1.0941171847212174, + "learning_rate": 6.7758419158328995e-06, + "loss": 0.5166, + "step": 7945 + }, + { + "epoch": 0.62, + "grad_norm": 1.1501688978310545, + "learning_rate": 6.773463527853925e-06, + "loss": 0.4776, + "step": 7946 + }, + { + "epoch": 0.62, + "grad_norm": 1.1178643903667111, + "learning_rate": 6.7710853435828435e-06, + "loss": 0.5202, + "step": 7947 + }, + { + "epoch": 0.62, + "grad_norm": 1.1799499017781123, + "learning_rate": 6.768707363169809e-06, + "loss": 0.4878, + "step": 7948 + }, + { + "epoch": 0.62, + "grad_norm": 1.1902226306322088, + "learning_rate": 6.766329586764944e-06, + "loss": 0.5255, + "step": 7949 + }, + { + "epoch": 0.62, + "grad_norm": 1.127517003779116, + "learning_rate": 6.7639520145183754e-06, + "loss": 0.4937, + "step": 7950 + }, + { + "epoch": 0.62, + "grad_norm": 1.1889673503246165, + "learning_rate": 6.761574646580215e-06, + "loss": 0.5909, + "step": 7951 + }, + { + "epoch": 0.62, + "grad_norm": 1.207067001455262, + "learning_rate": 6.759197483100553e-06, + "loss": 0.5352, + "step": 7952 + }, + { + "epoch": 0.62, + "grad_norm": 1.1140449982668799, + "learning_rate": 6.756820524229477e-06, + "loss": 0.49, + "step": 7953 + }, + { + "epoch": 0.62, + "grad_norm": 1.3042881520069007, + "learning_rate": 6.754443770117052e-06, + "loss": 0.607, + "step": 7954 + }, + { + "epoch": 0.62, + "grad_norm": 1.1481597460710176, + "learning_rate": 6.7520672209133355e-06, + "loss": 0.5398, + "step": 7955 + }, + { + "epoch": 0.62, + "grad_norm": 1.1262324787601778, + "learning_rate": 6.749690876768374e-06, + "loss": 0.4918, + "step": 7956 + }, + { + "epoch": 0.62, + "grad_norm": 1.2529525695281212, + "learning_rate": 6.747314737832196e-06, + "loss": 0.5323, + "step": 7957 + }, + { + "epoch": 0.62, + "grad_norm": 1.1282482407478194, + "learning_rate": 6.744938804254823e-06, + "loss": 0.483, + "step": 7958 + }, + { + "epoch": 0.62, + "grad_norm": 1.380644440278784, + "learning_rate": 6.742563076186255e-06, + "loss": 0.6295, + "step": 7959 + }, + { + "epoch": 0.62, + "grad_norm": 1.180658559731154, + "learning_rate": 6.740187553776485e-06, + "loss": 0.5184, + "step": 7960 + }, + { + "epoch": 0.62, + "grad_norm": 1.167019359320177, + "learning_rate": 6.737812237175494e-06, + "loss": 0.5266, + "step": 7961 + }, + { + "epoch": 0.62, + "grad_norm": 1.2478725926550733, + "learning_rate": 6.735437126533246e-06, + "loss": 0.5071, + "step": 7962 + }, + { + "epoch": 0.62, + "grad_norm": 1.1053018612466925, + "learning_rate": 6.7330622219997e-06, + "loss": 0.4912, + "step": 7963 + }, + { + "epoch": 0.62, + "grad_norm": 1.1683931122005797, + "learning_rate": 6.730687523724787e-06, + "loss": 0.5673, + "step": 7964 + }, + { + "epoch": 0.62, + "grad_norm": 1.0695282227589669, + "learning_rate": 6.728313031858437e-06, + "loss": 0.4512, + "step": 7965 + }, + { + "epoch": 0.62, + "grad_norm": 1.1487810827622318, + "learning_rate": 6.725938746550569e-06, + "loss": 0.4804, + "step": 7966 + }, + { + "epoch": 0.62, + "grad_norm": 1.2762308201109906, + "learning_rate": 6.723564667951076e-06, + "loss": 0.6093, + "step": 7967 + }, + { + "epoch": 0.62, + "grad_norm": 1.1882297883436632, + "learning_rate": 6.7211907962098535e-06, + "loss": 0.5255, + "step": 7968 + }, + { + "epoch": 0.62, + "grad_norm": 1.170393248600238, + "learning_rate": 6.718817131476769e-06, + "loss": 0.5315, + "step": 7969 + }, + { + "epoch": 0.62, + "grad_norm": 1.1443256284604553, + "learning_rate": 6.7164436739016896e-06, + "loss": 0.5166, + "step": 7970 + }, + { + "epoch": 0.62, + "grad_norm": 1.211035004658763, + "learning_rate": 6.714070423634461e-06, + "loss": 0.5379, + "step": 7971 + }, + { + "epoch": 0.62, + "grad_norm": 1.2546069602722156, + "learning_rate": 6.7116973808249185e-06, + "loss": 0.5592, + "step": 7972 + }, + { + "epoch": 0.62, + "grad_norm": 1.261041985287015, + "learning_rate": 6.709324545622891e-06, + "loss": 0.5618, + "step": 7973 + }, + { + "epoch": 0.62, + "grad_norm": 1.253423199644568, + "learning_rate": 6.706951918178177e-06, + "loss": 0.5893, + "step": 7974 + }, + { + "epoch": 0.62, + "grad_norm": 1.064863325402281, + "learning_rate": 6.70457949864058e-06, + "loss": 0.4969, + "step": 7975 + }, + { + "epoch": 0.62, + "grad_norm": 1.0972214784633085, + "learning_rate": 6.702207287159882e-06, + "loss": 0.4627, + "step": 7976 + }, + { + "epoch": 0.62, + "grad_norm": 1.2453620221140367, + "learning_rate": 6.699835283885851e-06, + "loss": 0.5873, + "step": 7977 + }, + { + "epoch": 0.62, + "grad_norm": 1.2013890512033891, + "learning_rate": 6.697463488968249e-06, + "loss": 0.4977, + "step": 7978 + }, + { + "epoch": 0.62, + "grad_norm": 1.1240667074642812, + "learning_rate": 6.695091902556812e-06, + "loss": 0.5582, + "step": 7979 + }, + { + "epoch": 0.62, + "grad_norm": 1.1377490640802794, + "learning_rate": 6.692720524801273e-06, + "loss": 0.5416, + "step": 7980 + }, + { + "epoch": 0.62, + "grad_norm": 1.164344087887588, + "learning_rate": 6.690349355851353e-06, + "loss": 0.5033, + "step": 7981 + }, + { + "epoch": 0.62, + "grad_norm": 1.1565506518462345, + "learning_rate": 6.687978395856753e-06, + "loss": 0.5019, + "step": 7982 + }, + { + "epoch": 0.62, + "grad_norm": 1.1000411437749689, + "learning_rate": 6.6856076449671644e-06, + "loss": 0.468, + "step": 7983 + }, + { + "epoch": 0.62, + "grad_norm": 1.229097504474967, + "learning_rate": 6.683237103332268e-06, + "loss": 0.5227, + "step": 7984 + }, + { + "epoch": 0.62, + "grad_norm": 1.1920046944909812, + "learning_rate": 6.6808667711017224e-06, + "loss": 0.5343, + "step": 7985 + }, + { + "epoch": 0.62, + "grad_norm": 1.185502882617292, + "learning_rate": 6.6784966484251845e-06, + "loss": 0.4656, + "step": 7986 + }, + { + "epoch": 0.62, + "grad_norm": 1.0356141427207093, + "learning_rate": 6.676126735452286e-06, + "loss": 0.4529, + "step": 7987 + }, + { + "epoch": 0.62, + "grad_norm": 1.0815745467490792, + "learning_rate": 6.673757032332657e-06, + "loss": 0.4856, + "step": 7988 + }, + { + "epoch": 0.62, + "grad_norm": 1.1164200287403079, + "learning_rate": 6.671387539215912e-06, + "loss": 0.5345, + "step": 7989 + }, + { + "epoch": 0.62, + "grad_norm": 1.2616897910370313, + "learning_rate": 6.66901825625164e-06, + "loss": 0.5522, + "step": 7990 + }, + { + "epoch": 0.62, + "grad_norm": 1.2044423805158917, + "learning_rate": 6.666649183589435e-06, + "loss": 0.5221, + "step": 7991 + }, + { + "epoch": 0.62, + "grad_norm": 1.242394915843983, + "learning_rate": 6.664280321378862e-06, + "loss": 0.513, + "step": 7992 + }, + { + "epoch": 0.62, + "grad_norm": 1.290600315994042, + "learning_rate": 6.661911669769481e-06, + "loss": 0.5323, + "step": 7993 + }, + { + "epoch": 0.62, + "grad_norm": 1.19674048800859, + "learning_rate": 6.659543228910844e-06, + "loss": 0.4963, + "step": 7994 + }, + { + "epoch": 0.62, + "grad_norm": 1.183839778698033, + "learning_rate": 6.657174998952474e-06, + "loss": 0.5644, + "step": 7995 + }, + { + "epoch": 0.62, + "grad_norm": 1.1439531969186578, + "learning_rate": 6.654806980043893e-06, + "loss": 0.5814, + "step": 7996 + }, + { + "epoch": 0.62, + "grad_norm": 1.2879789017287988, + "learning_rate": 6.6524391723346045e-06, + "loss": 0.5661, + "step": 7997 + }, + { + "epoch": 0.62, + "grad_norm": 1.1458643417786534, + "learning_rate": 6.650071575974104e-06, + "loss": 0.5082, + "step": 7998 + }, + { + "epoch": 0.62, + "grad_norm": 1.183028082810128, + "learning_rate": 6.64770419111187e-06, + "loss": 0.5384, + "step": 7999 + }, + { + "epoch": 0.62, + "grad_norm": 1.3180446672989186, + "learning_rate": 6.645337017897364e-06, + "loss": 0.6019, + "step": 8000 + }, + { + "epoch": 0.62, + "grad_norm": 1.1009780198933996, + "learning_rate": 6.64297005648004e-06, + "loss": 0.54, + "step": 8001 + }, + { + "epoch": 0.62, + "grad_norm": 1.1427698219097382, + "learning_rate": 6.640603307009337e-06, + "loss": 0.5164, + "step": 8002 + }, + { + "epoch": 0.62, + "grad_norm": 1.146831609336497, + "learning_rate": 6.638236769634677e-06, + "loss": 0.4894, + "step": 8003 + }, + { + "epoch": 0.62, + "grad_norm": 1.1889526617221147, + "learning_rate": 6.635870444505479e-06, + "loss": 0.5086, + "step": 8004 + }, + { + "epoch": 0.62, + "grad_norm": 1.435674088219694, + "learning_rate": 6.633504331771133e-06, + "loss": 0.5725, + "step": 8005 + }, + { + "epoch": 0.62, + "grad_norm": 1.1108984896092007, + "learning_rate": 6.631138431581028e-06, + "loss": 0.5025, + "step": 8006 + }, + { + "epoch": 0.62, + "grad_norm": 1.1144714418143478, + "learning_rate": 6.628772744084534e-06, + "loss": 0.5254, + "step": 8007 + }, + { + "epoch": 0.62, + "grad_norm": 1.2043269705190347, + "learning_rate": 6.62640726943101e-06, + "loss": 0.5023, + "step": 8008 + }, + { + "epoch": 0.62, + "grad_norm": 1.1540997073768196, + "learning_rate": 6.624042007769804e-06, + "loss": 0.5433, + "step": 8009 + }, + { + "epoch": 0.62, + "grad_norm": 1.1627167468896367, + "learning_rate": 6.621676959250239e-06, + "loss": 0.5499, + "step": 8010 + }, + { + "epoch": 0.62, + "grad_norm": 1.2574048062766159, + "learning_rate": 6.619312124021641e-06, + "loss": 0.5617, + "step": 8011 + }, + { + "epoch": 0.62, + "grad_norm": 1.07319820129949, + "learning_rate": 6.6169475022333075e-06, + "loss": 0.5464, + "step": 8012 + }, + { + "epoch": 0.62, + "grad_norm": 1.224178694779988, + "learning_rate": 6.614583094034533e-06, + "loss": 0.5208, + "step": 8013 + }, + { + "epoch": 0.62, + "grad_norm": 1.146619226690463, + "learning_rate": 6.612218899574598e-06, + "loss": 0.539, + "step": 8014 + }, + { + "epoch": 0.62, + "grad_norm": 1.0959674156071852, + "learning_rate": 6.6098549190027584e-06, + "loss": 0.4975, + "step": 8015 + }, + { + "epoch": 0.62, + "grad_norm": 1.1302351999678646, + "learning_rate": 6.60749115246827e-06, + "loss": 0.5423, + "step": 8016 + }, + { + "epoch": 0.62, + "grad_norm": 1.1742665619294352, + "learning_rate": 6.605127600120368e-06, + "loss": 0.4982, + "step": 8017 + }, + { + "epoch": 0.62, + "grad_norm": 1.115350827418697, + "learning_rate": 6.602764262108274e-06, + "loss": 0.5443, + "step": 8018 + }, + { + "epoch": 0.62, + "grad_norm": 1.1259885788976922, + "learning_rate": 6.600401138581205e-06, + "loss": 0.532, + "step": 8019 + }, + { + "epoch": 0.62, + "grad_norm": 1.1503542520313583, + "learning_rate": 6.598038229688347e-06, + "loss": 0.5526, + "step": 8020 + }, + { + "epoch": 0.62, + "grad_norm": 1.2370709301558145, + "learning_rate": 6.5956755355788895e-06, + "loss": 0.6149, + "step": 8021 + }, + { + "epoch": 0.62, + "grad_norm": 1.0876145620403093, + "learning_rate": 6.5933130564019976e-06, + "loss": 0.5004, + "step": 8022 + }, + { + "epoch": 0.62, + "grad_norm": 1.1713440264371344, + "learning_rate": 6.590950792306829e-06, + "loss": 0.5335, + "step": 8023 + }, + { + "epoch": 0.62, + "grad_norm": 1.180905452312604, + "learning_rate": 6.5885887434425275e-06, + "loss": 0.5437, + "step": 8024 + }, + { + "epoch": 0.62, + "grad_norm": 1.204417686082076, + "learning_rate": 6.586226909958218e-06, + "loss": 0.5385, + "step": 8025 + }, + { + "epoch": 0.62, + "grad_norm": 1.0626486505977926, + "learning_rate": 6.583865292003015e-06, + "loss": 0.5315, + "step": 8026 + }, + { + "epoch": 0.62, + "grad_norm": 1.0937478746666014, + "learning_rate": 6.581503889726022e-06, + "loss": 0.5229, + "step": 8027 + }, + { + "epoch": 0.62, + "grad_norm": 1.2301821904819097, + "learning_rate": 6.579142703276325e-06, + "loss": 0.5037, + "step": 8028 + }, + { + "epoch": 0.62, + "grad_norm": 1.099661844468642, + "learning_rate": 6.576781732803001e-06, + "loss": 0.5408, + "step": 8029 + }, + { + "epoch": 0.62, + "grad_norm": 1.131593245389108, + "learning_rate": 6.574420978455105e-06, + "loss": 0.538, + "step": 8030 + }, + { + "epoch": 0.62, + "grad_norm": 1.1688908537987979, + "learning_rate": 6.572060440381688e-06, + "loss": 0.5571, + "step": 8031 + }, + { + "epoch": 0.62, + "grad_norm": 1.103406080362045, + "learning_rate": 6.569700118731779e-06, + "loss": 0.4678, + "step": 8032 + }, + { + "epoch": 0.62, + "grad_norm": 1.1022211364856964, + "learning_rate": 6.5673400136544e-06, + "loss": 0.5141, + "step": 8033 + }, + { + "epoch": 0.62, + "grad_norm": 1.1820127799638995, + "learning_rate": 6.564980125298559e-06, + "loss": 0.5604, + "step": 8034 + }, + { + "epoch": 0.62, + "grad_norm": 1.158854000934489, + "learning_rate": 6.562620453813242e-06, + "loss": 0.5273, + "step": 8035 + }, + { + "epoch": 0.62, + "grad_norm": 1.2873508996645737, + "learning_rate": 6.560260999347432e-06, + "loss": 0.4882, + "step": 8036 + }, + { + "epoch": 0.62, + "grad_norm": 1.1934890801449982, + "learning_rate": 6.557901762050091e-06, + "loss": 0.5406, + "step": 8037 + }, + { + "epoch": 0.62, + "grad_norm": 1.2175066547877735, + "learning_rate": 6.555542742070169e-06, + "loss": 0.5582, + "step": 8038 + }, + { + "epoch": 0.62, + "grad_norm": 1.2787965684393074, + "learning_rate": 6.553183939556608e-06, + "loss": 0.5725, + "step": 8039 + }, + { + "epoch": 0.62, + "grad_norm": 1.1375518053435603, + "learning_rate": 6.550825354658328e-06, + "loss": 0.5195, + "step": 8040 + }, + { + "epoch": 0.62, + "grad_norm": 1.1587409946457963, + "learning_rate": 6.548466987524238e-06, + "loss": 0.5572, + "step": 8041 + }, + { + "epoch": 0.62, + "grad_norm": 1.3383814002051737, + "learning_rate": 6.546108838303233e-06, + "loss": 0.517, + "step": 8042 + }, + { + "epoch": 0.62, + "grad_norm": 1.1342899553350756, + "learning_rate": 6.543750907144196e-06, + "loss": 0.5599, + "step": 8043 + }, + { + "epoch": 0.62, + "grad_norm": 1.2611076835738857, + "learning_rate": 6.5413931941959994e-06, + "loss": 0.564, + "step": 8044 + }, + { + "epoch": 0.62, + "grad_norm": 1.1934999174030916, + "learning_rate": 6.539035699607494e-06, + "loss": 0.538, + "step": 8045 + }, + { + "epoch": 0.62, + "grad_norm": 1.1913859881569846, + "learning_rate": 6.53667842352752e-06, + "loss": 0.5237, + "step": 8046 + }, + { + "epoch": 0.62, + "grad_norm": 1.142456987268308, + "learning_rate": 6.534321366104905e-06, + "loss": 0.5534, + "step": 8047 + }, + { + "epoch": 0.62, + "grad_norm": 1.2650165743740918, + "learning_rate": 6.531964527488463e-06, + "loss": 0.5439, + "step": 8048 + }, + { + "epoch": 0.62, + "grad_norm": 1.177078241432902, + "learning_rate": 6.529607907826994e-06, + "loss": 0.5396, + "step": 8049 + }, + { + "epoch": 0.62, + "grad_norm": 1.293195952307599, + "learning_rate": 6.527251507269283e-06, + "loss": 0.598, + "step": 8050 + }, + { + "epoch": 0.62, + "grad_norm": 1.2534863966366128, + "learning_rate": 6.524895325964102e-06, + "loss": 0.5722, + "step": 8051 + }, + { + "epoch": 0.62, + "grad_norm": 1.1097887906184716, + "learning_rate": 6.522539364060205e-06, + "loss": 0.5223, + "step": 8052 + }, + { + "epoch": 0.62, + "grad_norm": 1.1559349352500576, + "learning_rate": 6.5201836217063395e-06, + "loss": 0.5119, + "step": 8053 + }, + { + "epoch": 0.62, + "grad_norm": 1.160296653189349, + "learning_rate": 6.517828099051238e-06, + "loss": 0.5649, + "step": 8054 + }, + { + "epoch": 0.62, + "grad_norm": 1.0796615045401898, + "learning_rate": 6.515472796243615e-06, + "loss": 0.4854, + "step": 8055 + }, + { + "epoch": 0.62, + "grad_norm": 1.180626754234388, + "learning_rate": 6.513117713432171e-06, + "loss": 0.5219, + "step": 8056 + }, + { + "epoch": 0.63, + "grad_norm": 1.2019430522725991, + "learning_rate": 6.510762850765594e-06, + "loss": 0.4822, + "step": 8057 + }, + { + "epoch": 0.63, + "grad_norm": 1.3580501282615738, + "learning_rate": 6.50840820839256e-06, + "loss": 0.5589, + "step": 8058 + }, + { + "epoch": 0.63, + "grad_norm": 1.0983115307134994, + "learning_rate": 6.506053786461732e-06, + "loss": 0.5241, + "step": 8059 + }, + { + "epoch": 0.63, + "grad_norm": 1.1142515003450155, + "learning_rate": 6.5036995851217565e-06, + "loss": 0.5246, + "step": 8060 + }, + { + "epoch": 0.63, + "grad_norm": 1.135803588362028, + "learning_rate": 6.501345604521263e-06, + "loss": 0.5331, + "step": 8061 + }, + { + "epoch": 0.63, + "grad_norm": 1.1589763048364812, + "learning_rate": 6.49899184480887e-06, + "loss": 0.5037, + "step": 8062 + }, + { + "epoch": 0.63, + "grad_norm": 1.1659016769169397, + "learning_rate": 6.496638306133186e-06, + "loss": 0.5319, + "step": 8063 + }, + { + "epoch": 0.63, + "grad_norm": 1.1396461885588618, + "learning_rate": 6.494284988642803e-06, + "loss": 0.489, + "step": 8064 + }, + { + "epoch": 0.63, + "grad_norm": 1.190222981182132, + "learning_rate": 6.491931892486294e-06, + "loss": 0.5518, + "step": 8065 + }, + { + "epoch": 0.63, + "grad_norm": 1.1799287865049926, + "learning_rate": 6.489579017812224e-06, + "loss": 0.5291, + "step": 8066 + }, + { + "epoch": 0.63, + "grad_norm": 1.2032853552234715, + "learning_rate": 6.487226364769141e-06, + "loss": 0.5397, + "step": 8067 + }, + { + "epoch": 0.63, + "grad_norm": 1.1988851971805627, + "learning_rate": 6.484873933505581e-06, + "loss": 0.5361, + "step": 8068 + }, + { + "epoch": 0.63, + "grad_norm": 1.1916125009582101, + "learning_rate": 6.482521724170068e-06, + "loss": 0.5381, + "step": 8069 + }, + { + "epoch": 0.63, + "grad_norm": 1.1859034295898843, + "learning_rate": 6.480169736911104e-06, + "loss": 0.5412, + "step": 8070 + }, + { + "epoch": 0.63, + "grad_norm": 1.1138086508604188, + "learning_rate": 6.477817971877186e-06, + "loss": 0.5013, + "step": 8071 + }, + { + "epoch": 0.63, + "grad_norm": 1.2226249032679162, + "learning_rate": 6.47546642921679e-06, + "loss": 0.4959, + "step": 8072 + }, + { + "epoch": 0.63, + "grad_norm": 1.1791949222715465, + "learning_rate": 6.4731151090783814e-06, + "loss": 0.498, + "step": 8073 + }, + { + "epoch": 0.63, + "grad_norm": 1.1192722209899926, + "learning_rate": 6.470764011610415e-06, + "loss": 0.544, + "step": 8074 + }, + { + "epoch": 0.63, + "grad_norm": 1.255417480561332, + "learning_rate": 6.468413136961325e-06, + "loss": 0.5461, + "step": 8075 + }, + { + "epoch": 0.63, + "grad_norm": 1.2249791805288721, + "learning_rate": 6.4660624852795336e-06, + "loss": 0.5201, + "step": 8076 + }, + { + "epoch": 0.63, + "grad_norm": 1.2620239361126833, + "learning_rate": 6.463712056713449e-06, + "loss": 0.568, + "step": 8077 + }, + { + "epoch": 0.63, + "grad_norm": 1.201600682082281, + "learning_rate": 6.461361851411466e-06, + "loss": 0.5274, + "step": 8078 + }, + { + "epoch": 0.63, + "grad_norm": 1.294499162864488, + "learning_rate": 6.45901186952197e-06, + "loss": 0.5912, + "step": 8079 + }, + { + "epoch": 0.63, + "grad_norm": 1.1582935934774032, + "learning_rate": 6.456662111193322e-06, + "loss": 0.5443, + "step": 8080 + }, + { + "epoch": 0.63, + "grad_norm": 1.2203371521710187, + "learning_rate": 6.454312576573878e-06, + "loss": 0.567, + "step": 8081 + }, + { + "epoch": 0.63, + "grad_norm": 1.172150388784612, + "learning_rate": 6.451963265811971e-06, + "loss": 0.4928, + "step": 8082 + }, + { + "epoch": 0.63, + "grad_norm": 1.1801006780929693, + "learning_rate": 6.449614179055929e-06, + "loss": 0.563, + "step": 8083 + }, + { + "epoch": 0.63, + "grad_norm": 1.1425055065064067, + "learning_rate": 6.447265316454063e-06, + "loss": 0.5077, + "step": 8084 + }, + { + "epoch": 0.63, + "grad_norm": 1.1370770915358563, + "learning_rate": 6.444916678154667e-06, + "loss": 0.4995, + "step": 8085 + }, + { + "epoch": 0.63, + "grad_norm": 1.104175687549137, + "learning_rate": 6.442568264306024e-06, + "loss": 0.5123, + "step": 8086 + }, + { + "epoch": 0.63, + "grad_norm": 1.2270560273962663, + "learning_rate": 6.440220075056398e-06, + "loss": 0.5398, + "step": 8087 + }, + { + "epoch": 0.63, + "grad_norm": 1.241972082880874, + "learning_rate": 6.437872110554044e-06, + "loss": 0.5909, + "step": 8088 + }, + { + "epoch": 0.63, + "grad_norm": 1.1515595952363562, + "learning_rate": 6.4355243709472045e-06, + "loss": 0.5595, + "step": 8089 + }, + { + "epoch": 0.63, + "grad_norm": 1.0947170342227233, + "learning_rate": 6.433176856384103e-06, + "loss": 0.5011, + "step": 8090 + }, + { + "epoch": 0.63, + "grad_norm": 1.1940678118517964, + "learning_rate": 6.430829567012946e-06, + "loss": 0.5401, + "step": 8091 + }, + { + "epoch": 0.63, + "grad_norm": 1.2344310120054822, + "learning_rate": 6.428482502981933e-06, + "loss": 0.5629, + "step": 8092 + }, + { + "epoch": 0.63, + "grad_norm": 1.1614183438976375, + "learning_rate": 6.426135664439246e-06, + "loss": 0.5458, + "step": 8093 + }, + { + "epoch": 0.63, + "grad_norm": 1.0956054752784061, + "learning_rate": 6.423789051533056e-06, + "loss": 0.5425, + "step": 8094 + }, + { + "epoch": 0.63, + "grad_norm": 1.221843266379924, + "learning_rate": 6.4214426644115104e-06, + "loss": 0.5249, + "step": 8095 + }, + { + "epoch": 0.63, + "grad_norm": 1.1928426120026385, + "learning_rate": 6.419096503222757e-06, + "loss": 0.5446, + "step": 8096 + }, + { + "epoch": 0.63, + "grad_norm": 1.228508492890292, + "learning_rate": 6.416750568114911e-06, + "loss": 0.5188, + "step": 8097 + }, + { + "epoch": 0.63, + "grad_norm": 1.192796490116827, + "learning_rate": 6.414404859236091e-06, + "loss": 0.5588, + "step": 8098 + }, + { + "epoch": 0.63, + "grad_norm": 1.2827298526880522, + "learning_rate": 6.412059376734392e-06, + "loss": 0.5426, + "step": 8099 + }, + { + "epoch": 0.63, + "grad_norm": 1.1082316672273647, + "learning_rate": 6.409714120757895e-06, + "loss": 0.5164, + "step": 8100 + }, + { + "epoch": 0.63, + "grad_norm": 1.2100529547208914, + "learning_rate": 6.407369091454672e-06, + "loss": 0.5197, + "step": 8101 + }, + { + "epoch": 0.63, + "grad_norm": 1.2260628915838525, + "learning_rate": 6.40502428897277e-06, + "loss": 0.5549, + "step": 8102 + }, + { + "epoch": 0.63, + "grad_norm": 1.1489381801651146, + "learning_rate": 6.402679713460234e-06, + "loss": 0.5005, + "step": 8103 + }, + { + "epoch": 0.63, + "grad_norm": 1.174123412707984, + "learning_rate": 6.400335365065087e-06, + "loss": 0.5305, + "step": 8104 + }, + { + "epoch": 0.63, + "grad_norm": 1.2181456118738843, + "learning_rate": 6.397991243935339e-06, + "loss": 0.5451, + "step": 8105 + }, + { + "epoch": 0.63, + "grad_norm": 1.1271746069052053, + "learning_rate": 6.395647350218992e-06, + "loss": 0.4685, + "step": 8106 + }, + { + "epoch": 0.63, + "grad_norm": 1.1139567149740126, + "learning_rate": 6.393303684064019e-06, + "loss": 0.5267, + "step": 8107 + }, + { + "epoch": 0.63, + "grad_norm": 1.0985159985640889, + "learning_rate": 6.390960245618394e-06, + "loss": 0.4817, + "step": 8108 + }, + { + "epoch": 0.63, + "grad_norm": 1.1378639980229739, + "learning_rate": 6.388617035030069e-06, + "loss": 0.577, + "step": 8109 + }, + { + "epoch": 0.63, + "grad_norm": 1.2483777964117484, + "learning_rate": 6.386274052446982e-06, + "loss": 0.5169, + "step": 8110 + }, + { + "epoch": 0.63, + "grad_norm": 1.1303506869951976, + "learning_rate": 6.383931298017063e-06, + "loss": 0.4857, + "step": 8111 + }, + { + "epoch": 0.63, + "grad_norm": 1.1579573114726083, + "learning_rate": 6.381588771888213e-06, + "loss": 0.512, + "step": 8112 + }, + { + "epoch": 0.63, + "grad_norm": 1.0717487583389012, + "learning_rate": 6.379246474208332e-06, + "loss": 0.5091, + "step": 8113 + }, + { + "epoch": 0.63, + "grad_norm": 1.1624755795282637, + "learning_rate": 6.376904405125304e-06, + "loss": 0.5323, + "step": 8114 + }, + { + "epoch": 0.63, + "grad_norm": 1.143481140638387, + "learning_rate": 6.374562564786993e-06, + "loss": 0.5195, + "step": 8115 + }, + { + "epoch": 0.63, + "grad_norm": 1.1233260628684765, + "learning_rate": 6.372220953341254e-06, + "loss": 0.5013, + "step": 8116 + }, + { + "epoch": 0.63, + "grad_norm": 1.080642911234163, + "learning_rate": 6.36987957093592e-06, + "loss": 0.5109, + "step": 8117 + }, + { + "epoch": 0.63, + "grad_norm": 1.1621251209378372, + "learning_rate": 6.367538417718817e-06, + "loss": 0.4952, + "step": 8118 + }, + { + "epoch": 0.63, + "grad_norm": 1.2936428652278116, + "learning_rate": 6.365197493837757e-06, + "loss": 0.5466, + "step": 8119 + }, + { + "epoch": 0.63, + "grad_norm": 1.0631700534681894, + "learning_rate": 6.362856799440531e-06, + "loss": 0.517, + "step": 8120 + }, + { + "epoch": 0.63, + "grad_norm": 1.1429910549404527, + "learning_rate": 6.360516334674924e-06, + "loss": 0.518, + "step": 8121 + }, + { + "epoch": 0.63, + "grad_norm": 1.2255141522179076, + "learning_rate": 6.358176099688693e-06, + "loss": 0.5635, + "step": 8122 + }, + { + "epoch": 0.63, + "grad_norm": 1.1898499877525728, + "learning_rate": 6.355836094629596e-06, + "loss": 0.5652, + "step": 8123 + }, + { + "epoch": 0.63, + "grad_norm": 1.2327496898204358, + "learning_rate": 6.353496319645369e-06, + "loss": 0.5372, + "step": 8124 + }, + { + "epoch": 0.63, + "grad_norm": 1.2067161568526497, + "learning_rate": 6.351156774883731e-06, + "loss": 0.554, + "step": 8125 + }, + { + "epoch": 0.63, + "grad_norm": 1.1636667282846072, + "learning_rate": 6.348817460492396e-06, + "loss": 0.4822, + "step": 8126 + }, + { + "epoch": 0.63, + "grad_norm": 1.1580437330213513, + "learning_rate": 6.346478376619046e-06, + "loss": 0.4633, + "step": 8127 + }, + { + "epoch": 0.63, + "grad_norm": 1.1926301766764904, + "learning_rate": 6.344139523411368e-06, + "loss": 0.514, + "step": 8128 + }, + { + "epoch": 0.63, + "grad_norm": 1.1422670120137954, + "learning_rate": 6.341800901017024e-06, + "loss": 0.5179, + "step": 8129 + }, + { + "epoch": 0.63, + "grad_norm": 1.086437467019316, + "learning_rate": 6.339462509583663e-06, + "loss": 0.4955, + "step": 8130 + }, + { + "epoch": 0.63, + "grad_norm": 1.1578803038724266, + "learning_rate": 6.337124349258923e-06, + "loss": 0.5505, + "step": 8131 + }, + { + "epoch": 0.63, + "grad_norm": 1.2396365189028185, + "learning_rate": 6.334786420190415e-06, + "loss": 0.5078, + "step": 8132 + }, + { + "epoch": 0.63, + "grad_norm": 1.1917749553553898, + "learning_rate": 6.3324487225257526e-06, + "loss": 0.5211, + "step": 8133 + }, + { + "epoch": 0.63, + "grad_norm": 1.1814016053131546, + "learning_rate": 6.330111256412525e-06, + "loss": 0.5064, + "step": 8134 + }, + { + "epoch": 0.63, + "grad_norm": 1.173899516697287, + "learning_rate": 6.3277740219983066e-06, + "loss": 0.5611, + "step": 8135 + }, + { + "epoch": 0.63, + "grad_norm": 1.0959501209081624, + "learning_rate": 6.325437019430665e-06, + "loss": 0.4615, + "step": 8136 + }, + { + "epoch": 0.63, + "grad_norm": 1.1833449501935949, + "learning_rate": 6.323100248857137e-06, + "loss": 0.5287, + "step": 8137 + }, + { + "epoch": 0.63, + "grad_norm": 1.1935617927660638, + "learning_rate": 6.320763710425262e-06, + "loss": 0.5522, + "step": 8138 + }, + { + "epoch": 0.63, + "grad_norm": 1.246307593863828, + "learning_rate": 6.318427404282557e-06, + "loss": 0.5709, + "step": 8139 + }, + { + "epoch": 0.63, + "grad_norm": 1.1493925936888927, + "learning_rate": 6.316091330576523e-06, + "loss": 0.5546, + "step": 8140 + }, + { + "epoch": 0.63, + "grad_norm": 1.2851754508859623, + "learning_rate": 6.313755489454654e-06, + "loss": 0.5924, + "step": 8141 + }, + { + "epoch": 0.63, + "grad_norm": 1.2259033764697531, + "learning_rate": 6.311419881064416e-06, + "loss": 0.5237, + "step": 8142 + }, + { + "epoch": 0.63, + "grad_norm": 1.1730304870578598, + "learning_rate": 6.309084505553269e-06, + "loss": 0.5347, + "step": 8143 + }, + { + "epoch": 0.63, + "grad_norm": 1.2374726552543034, + "learning_rate": 6.306749363068665e-06, + "loss": 0.5426, + "step": 8144 + }, + { + "epoch": 0.63, + "grad_norm": 1.1704444800648626, + "learning_rate": 6.304414453758024e-06, + "loss": 0.5033, + "step": 8145 + }, + { + "epoch": 0.63, + "grad_norm": 1.1944052051535528, + "learning_rate": 6.3020797777687705e-06, + "loss": 0.5466, + "step": 8146 + }, + { + "epoch": 0.63, + "grad_norm": 1.2234306442438918, + "learning_rate": 6.299745335248295e-06, + "loss": 0.4611, + "step": 8147 + }, + { + "epoch": 0.63, + "grad_norm": 1.1135698446354996, + "learning_rate": 6.297411126343988e-06, + "loss": 0.4788, + "step": 8148 + }, + { + "epoch": 0.63, + "grad_norm": 1.2336560340508356, + "learning_rate": 6.295077151203221e-06, + "loss": 0.5432, + "step": 8149 + }, + { + "epoch": 0.63, + "grad_norm": 1.0181511786498405, + "learning_rate": 6.292743409973345e-06, + "loss": 0.4528, + "step": 8150 + }, + { + "epoch": 0.63, + "grad_norm": 1.2724951317175053, + "learning_rate": 6.290409902801706e-06, + "loss": 0.5483, + "step": 8151 + }, + { + "epoch": 0.63, + "grad_norm": 1.1709874670496545, + "learning_rate": 6.288076629835633e-06, + "loss": 0.5352, + "step": 8152 + }, + { + "epoch": 0.63, + "grad_norm": 1.2000504880456666, + "learning_rate": 6.285743591222428e-06, + "loss": 0.5182, + "step": 8153 + }, + { + "epoch": 0.63, + "grad_norm": 1.1198374125281498, + "learning_rate": 6.283410787109396e-06, + "loss": 0.5429, + "step": 8154 + }, + { + "epoch": 0.63, + "grad_norm": 1.1693575458049734, + "learning_rate": 6.2810782176438145e-06, + "loss": 0.5388, + "step": 8155 + }, + { + "epoch": 0.63, + "grad_norm": 1.2620790044022467, + "learning_rate": 6.278745882972952e-06, + "loss": 0.5676, + "step": 8156 + }, + { + "epoch": 0.63, + "grad_norm": 1.1385941388927983, + "learning_rate": 6.276413783244064e-06, + "loss": 0.5279, + "step": 8157 + }, + { + "epoch": 0.63, + "grad_norm": 1.2475136347543503, + "learning_rate": 6.274081918604382e-06, + "loss": 0.5334, + "step": 8158 + }, + { + "epoch": 0.63, + "grad_norm": 1.1627673938822398, + "learning_rate": 6.271750289201134e-06, + "loss": 0.5334, + "step": 8159 + }, + { + "epoch": 0.63, + "grad_norm": 1.1870628355342407, + "learning_rate": 6.269418895181523e-06, + "loss": 0.5608, + "step": 8160 + }, + { + "epoch": 0.63, + "grad_norm": 1.285418637282944, + "learning_rate": 6.267087736692744e-06, + "loss": 0.5837, + "step": 8161 + }, + { + "epoch": 0.63, + "grad_norm": 1.0852448703949569, + "learning_rate": 6.26475681388198e-06, + "loss": 0.5261, + "step": 8162 + }, + { + "epoch": 0.63, + "grad_norm": 1.1479191551019983, + "learning_rate": 6.262426126896386e-06, + "loss": 0.5194, + "step": 8163 + }, + { + "epoch": 0.63, + "grad_norm": 1.198075286757198, + "learning_rate": 6.260095675883116e-06, + "loss": 0.5426, + "step": 8164 + }, + { + "epoch": 0.63, + "grad_norm": 1.5790903896674928, + "learning_rate": 6.257765460989298e-06, + "loss": 0.493, + "step": 8165 + }, + { + "epoch": 0.63, + "grad_norm": 1.193184747733611, + "learning_rate": 6.255435482362056e-06, + "loss": 0.5195, + "step": 8166 + }, + { + "epoch": 0.63, + "grad_norm": 1.140465032778158, + "learning_rate": 6.253105740148493e-06, + "loss": 0.4906, + "step": 8167 + }, + { + "epoch": 0.63, + "grad_norm": 1.0894346392223409, + "learning_rate": 6.2507762344956925e-06, + "loss": 0.4609, + "step": 8168 + }, + { + "epoch": 0.63, + "grad_norm": 1.1849297263639302, + "learning_rate": 6.248446965550735e-06, + "loss": 0.513, + "step": 8169 + }, + { + "epoch": 0.63, + "grad_norm": 1.0231387452150094, + "learning_rate": 6.246117933460673e-06, + "loss": 0.4737, + "step": 8170 + }, + { + "epoch": 0.63, + "grad_norm": 1.376445660520037, + "learning_rate": 6.2437891383725535e-06, + "loss": 0.5844, + "step": 8171 + }, + { + "epoch": 0.63, + "grad_norm": 1.1341401311520276, + "learning_rate": 6.241460580433411e-06, + "loss": 0.4763, + "step": 8172 + }, + { + "epoch": 0.63, + "grad_norm": 1.2005624605300265, + "learning_rate": 6.239132259790248e-06, + "loss": 0.5166, + "step": 8173 + }, + { + "epoch": 0.63, + "grad_norm": 1.1994956625652882, + "learning_rate": 6.23680417659007e-06, + "loss": 0.555, + "step": 8174 + }, + { + "epoch": 0.63, + "grad_norm": 1.1545582844827547, + "learning_rate": 6.234476330979859e-06, + "loss": 0.5358, + "step": 8175 + }, + { + "epoch": 0.63, + "grad_norm": 1.1117929247561533, + "learning_rate": 6.232148723106586e-06, + "loss": 0.517, + "step": 8176 + }, + { + "epoch": 0.63, + "grad_norm": 1.1711607218520421, + "learning_rate": 6.2298213531172055e-06, + "loss": 0.5025, + "step": 8177 + }, + { + "epoch": 0.63, + "grad_norm": 1.1826246971692047, + "learning_rate": 6.227494221158652e-06, + "loss": 0.4513, + "step": 8178 + }, + { + "epoch": 0.63, + "grad_norm": 1.1192390971109658, + "learning_rate": 6.225167327377852e-06, + "loss": 0.5284, + "step": 8179 + }, + { + "epoch": 0.63, + "grad_norm": 1.165086027415438, + "learning_rate": 6.222840671921715e-06, + "loss": 0.472, + "step": 8180 + }, + { + "epoch": 0.63, + "grad_norm": 1.1796114562327191, + "learning_rate": 6.220514254937131e-06, + "loss": 0.497, + "step": 8181 + }, + { + "epoch": 0.63, + "grad_norm": 1.2233615096500101, + "learning_rate": 6.218188076570988e-06, + "loss": 0.5247, + "step": 8182 + }, + { + "epoch": 0.63, + "grad_norm": 1.1942694108202858, + "learning_rate": 6.215862136970139e-06, + "loss": 0.5362, + "step": 8183 + }, + { + "epoch": 0.63, + "grad_norm": 1.2695039716871048, + "learning_rate": 6.213536436281438e-06, + "loss": 0.573, + "step": 8184 + }, + { + "epoch": 0.63, + "grad_norm": 1.2133352843642704, + "learning_rate": 6.211210974651716e-06, + "loss": 0.4941, + "step": 8185 + }, + { + "epoch": 0.64, + "grad_norm": 1.2585189446655267, + "learning_rate": 6.208885752227791e-06, + "loss": 0.5347, + "step": 8186 + }, + { + "epoch": 0.64, + "grad_norm": 1.1384544623025161, + "learning_rate": 6.2065607691564736e-06, + "loss": 0.4804, + "step": 8187 + }, + { + "epoch": 0.64, + "grad_norm": 1.228707060582671, + "learning_rate": 6.204236025584542e-06, + "loss": 0.5652, + "step": 8188 + }, + { + "epoch": 0.64, + "grad_norm": 1.2145456027958572, + "learning_rate": 6.201911521658777e-06, + "loss": 0.4816, + "step": 8189 + }, + { + "epoch": 0.64, + "grad_norm": 1.1635301127780793, + "learning_rate": 6.19958725752593e-06, + "loss": 0.541, + "step": 8190 + }, + { + "epoch": 0.64, + "grad_norm": 1.1273108696322753, + "learning_rate": 6.197263233332747e-06, + "loss": 0.5061, + "step": 8191 + }, + { + "epoch": 0.64, + "grad_norm": 1.1827566378061876, + "learning_rate": 6.19493944922596e-06, + "loss": 0.5096, + "step": 8192 + }, + { + "epoch": 0.64, + "grad_norm": 1.2844155910788315, + "learning_rate": 6.192615905352273e-06, + "loss": 0.521, + "step": 8193 + }, + { + "epoch": 0.64, + "grad_norm": 1.1024948395069822, + "learning_rate": 6.190292601858389e-06, + "loss": 0.4921, + "step": 8194 + }, + { + "epoch": 0.64, + "grad_norm": 1.1057808371480182, + "learning_rate": 6.1879695388909865e-06, + "loss": 0.441, + "step": 8195 + }, + { + "epoch": 0.64, + "grad_norm": 1.2051472698140704, + "learning_rate": 6.185646716596735e-06, + "loss": 0.4912, + "step": 8196 + }, + { + "epoch": 0.64, + "grad_norm": 1.262531408542438, + "learning_rate": 6.183324135122289e-06, + "loss": 0.5507, + "step": 8197 + }, + { + "epoch": 0.64, + "grad_norm": 1.2005207561015714, + "learning_rate": 6.181001794614279e-06, + "loss": 0.5171, + "step": 8198 + }, + { + "epoch": 0.64, + "grad_norm": 1.2581858584318575, + "learning_rate": 6.17867969521933e-06, + "loss": 0.5506, + "step": 8199 + }, + { + "epoch": 0.64, + "grad_norm": 1.0402499419425246, + "learning_rate": 6.176357837084046e-06, + "loss": 0.5074, + "step": 8200 + }, + { + "epoch": 0.64, + "grad_norm": 1.339137789413524, + "learning_rate": 6.17403622035502e-06, + "loss": 0.5281, + "step": 8201 + }, + { + "epoch": 0.64, + "grad_norm": 1.2314929877301088, + "learning_rate": 6.1717148451788265e-06, + "loss": 0.5521, + "step": 8202 + }, + { + "epoch": 0.64, + "grad_norm": 1.2629923814013706, + "learning_rate": 6.169393711702027e-06, + "loss": 0.5692, + "step": 8203 + }, + { + "epoch": 0.64, + "grad_norm": 1.1138745784765145, + "learning_rate": 6.167072820071167e-06, + "loss": 0.5313, + "step": 8204 + }, + { + "epoch": 0.64, + "grad_norm": 1.1767591294403938, + "learning_rate": 6.164752170432773e-06, + "loss": 0.5292, + "step": 8205 + }, + { + "epoch": 0.64, + "grad_norm": 1.1553288476317225, + "learning_rate": 6.162431762933361e-06, + "loss": 0.531, + "step": 8206 + }, + { + "epoch": 0.64, + "grad_norm": 1.1536558795201628, + "learning_rate": 6.160111597719433e-06, + "loss": 0.5361, + "step": 8207 + }, + { + "epoch": 0.64, + "grad_norm": 1.3086055470404578, + "learning_rate": 6.157791674937471e-06, + "loss": 0.5402, + "step": 8208 + }, + { + "epoch": 0.64, + "grad_norm": 1.2474205582729367, + "learning_rate": 6.1554719947339435e-06, + "loss": 0.5066, + "step": 8209 + }, + { + "epoch": 0.64, + "grad_norm": 1.1854479275533065, + "learning_rate": 6.153152557255303e-06, + "loss": 0.5158, + "step": 8210 + }, + { + "epoch": 0.64, + "grad_norm": 1.2329209854103622, + "learning_rate": 6.150833362647988e-06, + "loss": 0.5584, + "step": 8211 + }, + { + "epoch": 0.64, + "grad_norm": 1.3002949856818755, + "learning_rate": 6.148514411058424e-06, + "loss": 0.5073, + "step": 8212 + }, + { + "epoch": 0.64, + "grad_norm": 1.1295832415704656, + "learning_rate": 6.146195702633018e-06, + "loss": 0.5404, + "step": 8213 + }, + { + "epoch": 0.64, + "grad_norm": 1.1528537462217887, + "learning_rate": 6.143877237518158e-06, + "loss": 0.562, + "step": 8214 + }, + { + "epoch": 0.64, + "grad_norm": 1.2224765925906589, + "learning_rate": 6.141559015860221e-06, + "loss": 0.5699, + "step": 8215 + }, + { + "epoch": 0.64, + "grad_norm": 1.1697335067698311, + "learning_rate": 6.1392410378055725e-06, + "loss": 0.5119, + "step": 8216 + }, + { + "epoch": 0.64, + "grad_norm": 1.2611346236084933, + "learning_rate": 6.136923303500556e-06, + "loss": 0.5468, + "step": 8217 + }, + { + "epoch": 0.64, + "grad_norm": 1.3625432068859702, + "learning_rate": 6.134605813091503e-06, + "loss": 0.5605, + "step": 8218 + }, + { + "epoch": 0.64, + "grad_norm": 1.0981414374226928, + "learning_rate": 6.132288566724728e-06, + "loss": 0.5418, + "step": 8219 + }, + { + "epoch": 0.64, + "grad_norm": 1.1336824956815332, + "learning_rate": 6.129971564546529e-06, + "loss": 0.4852, + "step": 8220 + }, + { + "epoch": 0.64, + "grad_norm": 1.0734901315371415, + "learning_rate": 6.127654806703189e-06, + "loss": 0.493, + "step": 8221 + }, + { + "epoch": 0.64, + "grad_norm": 1.1775994421239755, + "learning_rate": 6.125338293340985e-06, + "loss": 0.5664, + "step": 8222 + }, + { + "epoch": 0.64, + "grad_norm": 1.0850686089406505, + "learning_rate": 6.123022024606165e-06, + "loss": 0.5001, + "step": 8223 + }, + { + "epoch": 0.64, + "grad_norm": 1.152421334451245, + "learning_rate": 6.120706000644965e-06, + "loss": 0.5164, + "step": 8224 + }, + { + "epoch": 0.64, + "grad_norm": 1.1588395992892193, + "learning_rate": 6.11839022160361e-06, + "loss": 0.5319, + "step": 8225 + }, + { + "epoch": 0.64, + "grad_norm": 1.2549046139734341, + "learning_rate": 6.116074687628305e-06, + "loss": 0.5439, + "step": 8226 + }, + { + "epoch": 0.64, + "grad_norm": 1.197136279239915, + "learning_rate": 6.113759398865247e-06, + "loss": 0.5232, + "step": 8227 + }, + { + "epoch": 0.64, + "grad_norm": 1.2277228825486712, + "learning_rate": 6.111444355460608e-06, + "loss": 0.512, + "step": 8228 + }, + { + "epoch": 0.64, + "grad_norm": 1.1986172438094187, + "learning_rate": 6.109129557560547e-06, + "loss": 0.5357, + "step": 8229 + }, + { + "epoch": 0.64, + "grad_norm": 1.2094532391772521, + "learning_rate": 6.106815005311211e-06, + "loss": 0.5506, + "step": 8230 + }, + { + "epoch": 0.64, + "grad_norm": 1.3012841887659479, + "learning_rate": 6.104500698858731e-06, + "loss": 0.5308, + "step": 8231 + }, + { + "epoch": 0.64, + "grad_norm": 1.177776835674133, + "learning_rate": 6.1021866383492205e-06, + "loss": 0.5641, + "step": 8232 + }, + { + "epoch": 0.64, + "grad_norm": 1.1547059755821678, + "learning_rate": 6.0998728239287784e-06, + "loss": 0.4707, + "step": 8233 + }, + { + "epoch": 0.64, + "grad_norm": 1.1894349846920798, + "learning_rate": 6.097559255743486e-06, + "loss": 0.5333, + "step": 8234 + }, + { + "epoch": 0.64, + "grad_norm": 1.167801861019125, + "learning_rate": 6.095245933939411e-06, + "loss": 0.5321, + "step": 8235 + }, + { + "epoch": 0.64, + "grad_norm": 1.2140268111905477, + "learning_rate": 6.092932858662604e-06, + "loss": 0.5507, + "step": 8236 + }, + { + "epoch": 0.64, + "grad_norm": 1.243314124172889, + "learning_rate": 6.0906200300591074e-06, + "loss": 0.5647, + "step": 8237 + }, + { + "epoch": 0.64, + "grad_norm": 1.0433527207528352, + "learning_rate": 6.088307448274937e-06, + "loss": 0.4527, + "step": 8238 + }, + { + "epoch": 0.64, + "grad_norm": 1.2909960954490376, + "learning_rate": 6.0859951134561e-06, + "loss": 0.5848, + "step": 8239 + }, + { + "epoch": 0.64, + "grad_norm": 1.0789159554743135, + "learning_rate": 6.083683025748584e-06, + "loss": 0.4831, + "step": 8240 + }, + { + "epoch": 0.64, + "grad_norm": 1.2012878858167118, + "learning_rate": 6.081371185298361e-06, + "loss": 0.4837, + "step": 8241 + }, + { + "epoch": 0.64, + "grad_norm": 1.304273345451697, + "learning_rate": 6.079059592251398e-06, + "loss": 0.5552, + "step": 8242 + }, + { + "epoch": 0.64, + "grad_norm": 1.1499823216447809, + "learning_rate": 6.076748246753632e-06, + "loss": 0.5411, + "step": 8243 + }, + { + "epoch": 0.64, + "grad_norm": 1.202401030070231, + "learning_rate": 6.074437148950987e-06, + "loss": 0.5333, + "step": 8244 + }, + { + "epoch": 0.64, + "grad_norm": 1.1568132523555825, + "learning_rate": 6.072126298989378e-06, + "loss": 0.5068, + "step": 8245 + }, + { + "epoch": 0.64, + "grad_norm": 1.0978312505591579, + "learning_rate": 6.069815697014701e-06, + "loss": 0.4789, + "step": 8246 + }, + { + "epoch": 0.64, + "grad_norm": 1.1675491344723636, + "learning_rate": 6.067505343172839e-06, + "loss": 0.5373, + "step": 8247 + }, + { + "epoch": 0.64, + "grad_norm": 1.2823501608692853, + "learning_rate": 6.065195237609655e-06, + "loss": 0.5246, + "step": 8248 + }, + { + "epoch": 0.64, + "grad_norm": 1.2692917818019045, + "learning_rate": 6.062885380470992e-06, + "loss": 0.524, + "step": 8249 + }, + { + "epoch": 0.64, + "grad_norm": 1.2624052559833838, + "learning_rate": 6.0605757719026884e-06, + "loss": 0.5993, + "step": 8250 + }, + { + "epoch": 0.64, + "grad_norm": 1.1384870794716562, + "learning_rate": 6.058266412050561e-06, + "loss": 0.5219, + "step": 8251 + }, + { + "epoch": 0.64, + "grad_norm": 1.1366436595956209, + "learning_rate": 6.055957301060413e-06, + "loss": 0.5149, + "step": 8252 + }, + { + "epoch": 0.64, + "grad_norm": 1.1804160085492914, + "learning_rate": 6.053648439078033e-06, + "loss": 0.5218, + "step": 8253 + }, + { + "epoch": 0.64, + "grad_norm": 1.1742472733353682, + "learning_rate": 6.05133982624918e-06, + "loss": 0.5603, + "step": 8254 + }, + { + "epoch": 0.64, + "grad_norm": 1.1696696065175627, + "learning_rate": 6.04903146271962e-06, + "loss": 0.5343, + "step": 8255 + }, + { + "epoch": 0.64, + "grad_norm": 1.0823302148606238, + "learning_rate": 6.046723348635086e-06, + "loss": 0.5017, + "step": 8256 + }, + { + "epoch": 0.64, + "grad_norm": 1.7344631825616788, + "learning_rate": 6.044415484141306e-06, + "loss": 0.5411, + "step": 8257 + }, + { + "epoch": 0.64, + "grad_norm": 1.2492929843319336, + "learning_rate": 6.042107869383982e-06, + "loss": 0.535, + "step": 8258 + }, + { + "epoch": 0.64, + "grad_norm": 1.2330886318839456, + "learning_rate": 6.039800504508813e-06, + "loss": 0.504, + "step": 8259 + }, + { + "epoch": 0.64, + "grad_norm": 1.1370313286738691, + "learning_rate": 6.0374933896614665e-06, + "loss": 0.5477, + "step": 8260 + }, + { + "epoch": 0.64, + "grad_norm": 1.235705515698876, + "learning_rate": 6.035186524987605e-06, + "loss": 0.5588, + "step": 8261 + }, + { + "epoch": 0.64, + "grad_norm": 1.2104577867851762, + "learning_rate": 6.032879910632876e-06, + "loss": 0.5505, + "step": 8262 + }, + { + "epoch": 0.64, + "grad_norm": 1.1999942123750353, + "learning_rate": 6.030573546742904e-06, + "loss": 0.5093, + "step": 8263 + }, + { + "epoch": 0.64, + "grad_norm": 1.2554652899437484, + "learning_rate": 6.028267433463309e-06, + "loss": 0.5354, + "step": 8264 + }, + { + "epoch": 0.64, + "grad_norm": 1.1396285106809467, + "learning_rate": 6.025961570939676e-06, + "loss": 0.482, + "step": 8265 + }, + { + "epoch": 0.64, + "grad_norm": 1.2477806893948147, + "learning_rate": 6.023655959317594e-06, + "loss": 0.5834, + "step": 8266 + }, + { + "epoch": 0.64, + "grad_norm": 1.178961776932395, + "learning_rate": 6.021350598742628e-06, + "loss": 0.5392, + "step": 8267 + }, + { + "epoch": 0.64, + "grad_norm": 1.2412358603387617, + "learning_rate": 6.019045489360325e-06, + "loss": 0.5755, + "step": 8268 + }, + { + "epoch": 0.64, + "grad_norm": 1.1502549718074888, + "learning_rate": 6.016740631316221e-06, + "loss": 0.5257, + "step": 8269 + }, + { + "epoch": 0.64, + "grad_norm": 1.2423343691581445, + "learning_rate": 6.01443602475583e-06, + "loss": 0.5144, + "step": 8270 + }, + { + "epoch": 0.64, + "grad_norm": 1.192894328356021, + "learning_rate": 6.0121316698246535e-06, + "loss": 0.5078, + "step": 8271 + }, + { + "epoch": 0.64, + "grad_norm": 1.2212386032556652, + "learning_rate": 6.009827566668183e-06, + "loss": 0.5382, + "step": 8272 + }, + { + "epoch": 0.64, + "grad_norm": 1.1543874947078763, + "learning_rate": 6.007523715431882e-06, + "loss": 0.5206, + "step": 8273 + }, + { + "epoch": 0.64, + "grad_norm": 1.2207255369036356, + "learning_rate": 6.0052201162612125e-06, + "loss": 0.5538, + "step": 8274 + }, + { + "epoch": 0.64, + "grad_norm": 1.0982657807048068, + "learning_rate": 6.002916769301601e-06, + "loss": 0.4986, + "step": 8275 + }, + { + "epoch": 0.64, + "grad_norm": 1.2048283693748971, + "learning_rate": 6.000613674698478e-06, + "loss": 0.5504, + "step": 8276 + }, + { + "epoch": 0.64, + "grad_norm": 1.1398957939372207, + "learning_rate": 5.99831083259725e-06, + "loss": 0.497, + "step": 8277 + }, + { + "epoch": 0.64, + "grad_norm": 1.192742370830532, + "learning_rate": 5.996008243143302e-06, + "loss": 0.5538, + "step": 8278 + }, + { + "epoch": 0.64, + "grad_norm": 1.1809412376100377, + "learning_rate": 5.993705906482016e-06, + "loss": 0.483, + "step": 8279 + }, + { + "epoch": 0.64, + "grad_norm": 1.2401758375395675, + "learning_rate": 5.991403822758741e-06, + "loss": 0.5735, + "step": 8280 + }, + { + "epoch": 0.64, + "grad_norm": 1.2014972029108515, + "learning_rate": 5.9891019921188264e-06, + "loss": 0.5472, + "step": 8281 + }, + { + "epoch": 0.64, + "grad_norm": 1.1871760076944107, + "learning_rate": 5.986800414707596e-06, + "loss": 0.5072, + "step": 8282 + }, + { + "epoch": 0.64, + "grad_norm": 1.065242873333841, + "learning_rate": 5.984499090670361e-06, + "loss": 0.4722, + "step": 8283 + }, + { + "epoch": 0.64, + "grad_norm": 1.2389210392943288, + "learning_rate": 5.982198020152419e-06, + "loss": 0.4703, + "step": 8284 + }, + { + "epoch": 0.64, + "grad_norm": 1.2349325441942252, + "learning_rate": 5.979897203299041e-06, + "loss": 0.5288, + "step": 8285 + }, + { + "epoch": 0.64, + "grad_norm": 1.1463681215681572, + "learning_rate": 5.977596640255494e-06, + "loss": 0.5446, + "step": 8286 + }, + { + "epoch": 0.64, + "grad_norm": 1.200546275411955, + "learning_rate": 5.975296331167025e-06, + "loss": 0.5458, + "step": 8287 + }, + { + "epoch": 0.64, + "grad_norm": 1.229241719082832, + "learning_rate": 5.972996276178862e-06, + "loss": 0.5615, + "step": 8288 + }, + { + "epoch": 0.64, + "grad_norm": 1.0795875801190369, + "learning_rate": 5.970696475436224e-06, + "loss": 0.5118, + "step": 8289 + }, + { + "epoch": 0.64, + "grad_norm": 1.1201241051510997, + "learning_rate": 5.968396929084303e-06, + "loss": 0.5111, + "step": 8290 + }, + { + "epoch": 0.64, + "grad_norm": 1.21812701808866, + "learning_rate": 5.966097637268284e-06, + "loss": 0.5223, + "step": 8291 + }, + { + "epoch": 0.64, + "grad_norm": 1.1651956048564998, + "learning_rate": 5.963798600133334e-06, + "loss": 0.5376, + "step": 8292 + }, + { + "epoch": 0.64, + "grad_norm": 1.0966688174272066, + "learning_rate": 5.961499817824603e-06, + "loss": 0.4903, + "step": 8293 + }, + { + "epoch": 0.64, + "grad_norm": 1.1281372407793313, + "learning_rate": 5.959201290487227e-06, + "loss": 0.5058, + "step": 8294 + }, + { + "epoch": 0.64, + "grad_norm": 1.0451609115279918, + "learning_rate": 5.956903018266317e-06, + "loss": 0.46, + "step": 8295 + }, + { + "epoch": 0.64, + "grad_norm": 1.02968762271863, + "learning_rate": 5.954605001306979e-06, + "loss": 0.4942, + "step": 8296 + }, + { + "epoch": 0.64, + "grad_norm": 1.1389752829484674, + "learning_rate": 5.952307239754302e-06, + "loss": 0.5019, + "step": 8297 + }, + { + "epoch": 0.64, + "grad_norm": 1.0504699018870116, + "learning_rate": 5.950009733753348e-06, + "loss": 0.505, + "step": 8298 + }, + { + "epoch": 0.64, + "grad_norm": 1.2733212751463376, + "learning_rate": 5.94771248344918e-06, + "loss": 0.6072, + "step": 8299 + }, + { + "epoch": 0.64, + "grad_norm": 1.1068717222418305, + "learning_rate": 5.945415488986827e-06, + "loss": 0.5147, + "step": 8300 + }, + { + "epoch": 0.64, + "grad_norm": 1.140142573927685, + "learning_rate": 5.943118750511312e-06, + "loss": 0.5457, + "step": 8301 + }, + { + "epoch": 0.64, + "grad_norm": 1.1281983687194435, + "learning_rate": 5.940822268167643e-06, + "loss": 0.5293, + "step": 8302 + }, + { + "epoch": 0.64, + "grad_norm": 1.1480746052687518, + "learning_rate": 5.938526042100805e-06, + "loss": 0.5418, + "step": 8303 + }, + { + "epoch": 0.64, + "grad_norm": 1.0439502227068225, + "learning_rate": 5.936230072455777e-06, + "loss": 0.5033, + "step": 8304 + }, + { + "epoch": 0.64, + "grad_norm": 1.1160185240059313, + "learning_rate": 5.933934359377506e-06, + "loss": 0.5487, + "step": 8305 + }, + { + "epoch": 0.64, + "grad_norm": 1.130777095772815, + "learning_rate": 5.931638903010936e-06, + "loss": 0.489, + "step": 8306 + }, + { + "epoch": 0.64, + "grad_norm": 1.2304808116503323, + "learning_rate": 5.929343703500996e-06, + "loss": 0.6044, + "step": 8307 + }, + { + "epoch": 0.64, + "grad_norm": 1.2536175356836792, + "learning_rate": 5.927048760992589e-06, + "loss": 0.5763, + "step": 8308 + }, + { + "epoch": 0.64, + "grad_norm": 1.3269634328401938, + "learning_rate": 5.9247540756306075e-06, + "loss": 0.5933, + "step": 8309 + }, + { + "epoch": 0.64, + "grad_norm": 1.0907472536488314, + "learning_rate": 5.922459647559926e-06, + "loss": 0.5136, + "step": 8310 + }, + { + "epoch": 0.64, + "grad_norm": 1.2105952609874586, + "learning_rate": 5.920165476925402e-06, + "loss": 0.5953, + "step": 8311 + }, + { + "epoch": 0.64, + "grad_norm": 1.2333478879499373, + "learning_rate": 5.917871563871884e-06, + "loss": 0.5153, + "step": 8312 + }, + { + "epoch": 0.64, + "grad_norm": 1.1397103077912958, + "learning_rate": 5.915577908544194e-06, + "loss": 0.5051, + "step": 8313 + }, + { + "epoch": 0.64, + "grad_norm": 1.2077586627129169, + "learning_rate": 5.913284511087142e-06, + "loss": 0.5548, + "step": 8314 + }, + { + "epoch": 0.65, + "grad_norm": 1.1768030941430447, + "learning_rate": 5.910991371645527e-06, + "loss": 0.5132, + "step": 8315 + }, + { + "epoch": 0.65, + "grad_norm": 1.077438135694472, + "learning_rate": 5.90869849036412e-06, + "loss": 0.4878, + "step": 8316 + }, + { + "epoch": 0.65, + "grad_norm": 1.2826787379381503, + "learning_rate": 5.906405867387688e-06, + "loss": 0.5751, + "step": 8317 + }, + { + "epoch": 0.65, + "grad_norm": 1.2665143126331475, + "learning_rate": 5.904113502860971e-06, + "loss": 0.5401, + "step": 8318 + }, + { + "epoch": 0.65, + "grad_norm": 1.1841015623389202, + "learning_rate": 5.901821396928702e-06, + "loss": 0.5724, + "step": 8319 + }, + { + "epoch": 0.65, + "grad_norm": 1.1058476206573513, + "learning_rate": 5.899529549735594e-06, + "loss": 0.4697, + "step": 8320 + }, + { + "epoch": 0.65, + "grad_norm": 1.2409869456376905, + "learning_rate": 5.897237961426339e-06, + "loss": 0.5898, + "step": 8321 + }, + { + "epoch": 0.65, + "grad_norm": 1.2756947363669284, + "learning_rate": 5.894946632145619e-06, + "loss": 0.5793, + "step": 8322 + }, + { + "epoch": 0.65, + "grad_norm": 1.1509343207981138, + "learning_rate": 5.892655562038098e-06, + "loss": 0.5017, + "step": 8323 + }, + { + "epoch": 0.65, + "grad_norm": 1.2297489543521822, + "learning_rate": 5.8903647512484205e-06, + "loss": 0.5716, + "step": 8324 + }, + { + "epoch": 0.65, + "grad_norm": 1.2654204497638248, + "learning_rate": 5.888074199921223e-06, + "loss": 0.5429, + "step": 8325 + }, + { + "epoch": 0.65, + "grad_norm": 1.17487368614918, + "learning_rate": 5.885783908201114e-06, + "loss": 0.5171, + "step": 8326 + }, + { + "epoch": 0.65, + "grad_norm": 1.1666095186042373, + "learning_rate": 5.883493876232693e-06, + "loss": 0.5151, + "step": 8327 + }, + { + "epoch": 0.65, + "grad_norm": 1.2161169218864893, + "learning_rate": 5.8812041041605426e-06, + "loss": 0.5609, + "step": 8328 + }, + { + "epoch": 0.65, + "grad_norm": 1.1686488184250539, + "learning_rate": 5.878914592129226e-06, + "loss": 0.5094, + "step": 8329 + }, + { + "epoch": 0.65, + "grad_norm": 1.1911274067741016, + "learning_rate": 5.876625340283296e-06, + "loss": 0.5063, + "step": 8330 + }, + { + "epoch": 0.65, + "grad_norm": 1.1299172596473543, + "learning_rate": 5.87433634876728e-06, + "loss": 0.501, + "step": 8331 + }, + { + "epoch": 0.65, + "grad_norm": 1.2161870074058212, + "learning_rate": 5.872047617725697e-06, + "loss": 0.5331, + "step": 8332 + }, + { + "epoch": 0.65, + "grad_norm": 1.0527753662734873, + "learning_rate": 5.869759147303042e-06, + "loss": 0.4686, + "step": 8333 + }, + { + "epoch": 0.65, + "grad_norm": 1.2055201284024752, + "learning_rate": 5.867470937643804e-06, + "loss": 0.4812, + "step": 8334 + }, + { + "epoch": 0.65, + "grad_norm": 1.1050213558843327, + "learning_rate": 5.865182988892449e-06, + "loss": 0.5358, + "step": 8335 + }, + { + "epoch": 0.65, + "grad_norm": 1.1816902087188583, + "learning_rate": 5.862895301193421e-06, + "loss": 0.5774, + "step": 8336 + }, + { + "epoch": 0.65, + "grad_norm": 1.118904128725864, + "learning_rate": 5.86060787469116e-06, + "loss": 0.4766, + "step": 8337 + }, + { + "epoch": 0.65, + "grad_norm": 1.1479559167612174, + "learning_rate": 5.858320709530077e-06, + "loss": 0.5263, + "step": 8338 + }, + { + "epoch": 0.65, + "grad_norm": 1.284340364680649, + "learning_rate": 5.8560338058545775e-06, + "loss": 0.5256, + "step": 8339 + }, + { + "epoch": 0.65, + "grad_norm": 1.149892909827278, + "learning_rate": 5.853747163809047e-06, + "loss": 0.5303, + "step": 8340 + }, + { + "epoch": 0.65, + "grad_norm": 1.0886447559560994, + "learning_rate": 5.851460783537848e-06, + "loss": 0.5177, + "step": 8341 + }, + { + "epoch": 0.65, + "grad_norm": 1.2905010173073816, + "learning_rate": 5.8491746651853305e-06, + "loss": 0.5882, + "step": 8342 + }, + { + "epoch": 0.65, + "grad_norm": 1.1416802424066883, + "learning_rate": 5.846888808895833e-06, + "loss": 0.4971, + "step": 8343 + }, + { + "epoch": 0.65, + "grad_norm": 1.2945659257553124, + "learning_rate": 5.8446032148136725e-06, + "loss": 0.5462, + "step": 8344 + }, + { + "epoch": 0.65, + "grad_norm": 1.1650222303864493, + "learning_rate": 5.842317883083153e-06, + "loss": 0.4941, + "step": 8345 + }, + { + "epoch": 0.65, + "grad_norm": 1.2245997378321456, + "learning_rate": 5.840032813848555e-06, + "loss": 0.516, + "step": 8346 + }, + { + "epoch": 0.65, + "grad_norm": 1.061853155817389, + "learning_rate": 5.837748007254146e-06, + "loss": 0.508, + "step": 8347 + }, + { + "epoch": 0.65, + "grad_norm": 1.1234121242835344, + "learning_rate": 5.835463463444179e-06, + "loss": 0.5253, + "step": 8348 + }, + { + "epoch": 0.65, + "grad_norm": 1.0684471303545413, + "learning_rate": 5.833179182562891e-06, + "loss": 0.4648, + "step": 8349 + }, + { + "epoch": 0.65, + "grad_norm": 1.184203289941239, + "learning_rate": 5.830895164754502e-06, + "loss": 0.5707, + "step": 8350 + }, + { + "epoch": 0.65, + "grad_norm": 1.118113900604512, + "learning_rate": 5.828611410163207e-06, + "loss": 0.453, + "step": 8351 + }, + { + "epoch": 0.65, + "grad_norm": 1.121939416592712, + "learning_rate": 5.826327918933197e-06, + "loss": 0.5192, + "step": 8352 + }, + { + "epoch": 0.65, + "grad_norm": 1.119115592849685, + "learning_rate": 5.824044691208641e-06, + "loss": 0.5349, + "step": 8353 + }, + { + "epoch": 0.65, + "grad_norm": 1.1414705565168812, + "learning_rate": 5.821761727133686e-06, + "loss": 0.495, + "step": 8354 + }, + { + "epoch": 0.65, + "grad_norm": 1.2759335167136183, + "learning_rate": 5.8194790268524725e-06, + "loss": 0.5713, + "step": 8355 + }, + { + "epoch": 0.65, + "grad_norm": 1.2192126643033732, + "learning_rate": 5.817196590509113e-06, + "loss": 0.5445, + "step": 8356 + }, + { + "epoch": 0.65, + "grad_norm": 1.4275029069166099, + "learning_rate": 5.814914418247714e-06, + "loss": 0.5057, + "step": 8357 + }, + { + "epoch": 0.65, + "grad_norm": 1.2192946464150107, + "learning_rate": 5.812632510212359e-06, + "loss": 0.548, + "step": 8358 + }, + { + "epoch": 0.65, + "grad_norm": 1.169197329424752, + "learning_rate": 5.810350866547119e-06, + "loss": 0.5311, + "step": 8359 + }, + { + "epoch": 0.65, + "grad_norm": 1.1895552968709187, + "learning_rate": 5.808069487396048e-06, + "loss": 0.4849, + "step": 8360 + }, + { + "epoch": 0.65, + "grad_norm": 1.0370600054640842, + "learning_rate": 5.805788372903174e-06, + "loss": 0.496, + "step": 8361 + }, + { + "epoch": 0.65, + "grad_norm": 1.133862607215138, + "learning_rate": 5.80350752321252e-06, + "loss": 0.5086, + "step": 8362 + }, + { + "epoch": 0.65, + "grad_norm": 1.1106481634074084, + "learning_rate": 5.801226938468089e-06, + "loss": 0.4932, + "step": 8363 + }, + { + "epoch": 0.65, + "grad_norm": 1.128475331359232, + "learning_rate": 5.798946618813861e-06, + "loss": 0.4842, + "step": 8364 + }, + { + "epoch": 0.65, + "grad_norm": 1.1648247290631113, + "learning_rate": 5.796666564393811e-06, + "loss": 0.5588, + "step": 8365 + }, + { + "epoch": 0.65, + "grad_norm": 1.1704829785764537, + "learning_rate": 5.7943867753518845e-06, + "loss": 0.5009, + "step": 8366 + }, + { + "epoch": 0.65, + "grad_norm": 1.0497571073883374, + "learning_rate": 5.792107251832018e-06, + "loss": 0.4907, + "step": 8367 + }, + { + "epoch": 0.65, + "grad_norm": 1.202705756129117, + "learning_rate": 5.789827993978131e-06, + "loss": 0.5751, + "step": 8368 + }, + { + "epoch": 0.65, + "grad_norm": 1.2167405900750052, + "learning_rate": 5.787549001934125e-06, + "loss": 0.5303, + "step": 8369 + }, + { + "epoch": 0.65, + "grad_norm": 1.196261658646584, + "learning_rate": 5.785270275843883e-06, + "loss": 0.5101, + "step": 8370 + }, + { + "epoch": 0.65, + "grad_norm": 1.207802436658382, + "learning_rate": 5.7829918158512774e-06, + "loss": 0.5534, + "step": 8371 + }, + { + "epoch": 0.65, + "grad_norm": 1.1272561444394045, + "learning_rate": 5.7807136221001515e-06, + "loss": 0.5044, + "step": 8372 + }, + { + "epoch": 0.65, + "grad_norm": 1.1363927104871914, + "learning_rate": 5.778435694734348e-06, + "loss": 0.5429, + "step": 8373 + }, + { + "epoch": 0.65, + "grad_norm": 1.1579923647067174, + "learning_rate": 5.776158033897674e-06, + "loss": 0.5115, + "step": 8374 + }, + { + "epoch": 0.65, + "grad_norm": 1.3101519516912479, + "learning_rate": 5.773880639733938e-06, + "loss": 0.623, + "step": 8375 + }, + { + "epoch": 0.65, + "grad_norm": 1.2101153135917795, + "learning_rate": 5.771603512386923e-06, + "loss": 0.535, + "step": 8376 + }, + { + "epoch": 0.65, + "grad_norm": 1.0191385967311948, + "learning_rate": 5.769326652000391e-06, + "loss": 0.4757, + "step": 8377 + }, + { + "epoch": 0.65, + "grad_norm": 1.160298707994621, + "learning_rate": 5.7670500587180935e-06, + "loss": 0.5258, + "step": 8378 + }, + { + "epoch": 0.65, + "grad_norm": 1.0811363403028997, + "learning_rate": 5.764773732683766e-06, + "loss": 0.5088, + "step": 8379 + }, + { + "epoch": 0.65, + "grad_norm": 1.2227004655636986, + "learning_rate": 5.7624976740411244e-06, + "loss": 0.5118, + "step": 8380 + }, + { + "epoch": 0.65, + "grad_norm": 1.195517079916166, + "learning_rate": 5.76022188293387e-06, + "loss": 0.5365, + "step": 8381 + }, + { + "epoch": 0.65, + "grad_norm": 1.1263613940665935, + "learning_rate": 5.757946359505679e-06, + "loss": 0.5192, + "step": 8382 + }, + { + "epoch": 0.65, + "grad_norm": 1.2229793231429569, + "learning_rate": 5.755671103900225e-06, + "loss": 0.5266, + "step": 8383 + }, + { + "epoch": 0.65, + "grad_norm": 1.245085592430017, + "learning_rate": 5.753396116261148e-06, + "loss": 0.5082, + "step": 8384 + }, + { + "epoch": 0.65, + "grad_norm": 1.1785393995416413, + "learning_rate": 5.751121396732082e-06, + "loss": 0.4906, + "step": 8385 + }, + { + "epoch": 0.65, + "grad_norm": 1.2330567769407892, + "learning_rate": 5.7488469454566484e-06, + "loss": 0.5328, + "step": 8386 + }, + { + "epoch": 0.65, + "grad_norm": 1.175403217363587, + "learning_rate": 5.746572762578437e-06, + "loss": 0.5163, + "step": 8387 + }, + { + "epoch": 0.65, + "grad_norm": 1.206452956444843, + "learning_rate": 5.744298848241032e-06, + "loss": 0.5356, + "step": 8388 + }, + { + "epoch": 0.65, + "grad_norm": 1.2145841265894468, + "learning_rate": 5.742025202587997e-06, + "loss": 0.5426, + "step": 8389 + }, + { + "epoch": 0.65, + "grad_norm": 1.2460494078396926, + "learning_rate": 5.739751825762878e-06, + "loss": 0.6032, + "step": 8390 + }, + { + "epoch": 0.65, + "grad_norm": 1.0768910409149992, + "learning_rate": 5.7374787179092106e-06, + "loss": 0.5589, + "step": 8391 + }, + { + "epoch": 0.65, + "grad_norm": 1.1603555732867057, + "learning_rate": 5.7352058791705e-06, + "loss": 0.5156, + "step": 8392 + }, + { + "epoch": 0.65, + "grad_norm": 1.1952020930742986, + "learning_rate": 5.732933309690251e-06, + "loss": 0.5132, + "step": 8393 + }, + { + "epoch": 0.65, + "grad_norm": 1.2150487696117778, + "learning_rate": 5.730661009611931e-06, + "loss": 0.5564, + "step": 8394 + }, + { + "epoch": 0.65, + "grad_norm": 1.0558316841403654, + "learning_rate": 5.7283889790790096e-06, + "loss": 0.4844, + "step": 8395 + }, + { + "epoch": 0.65, + "grad_norm": 1.1781275458270426, + "learning_rate": 5.7261172182349344e-06, + "loss": 0.574, + "step": 8396 + }, + { + "epoch": 0.65, + "grad_norm": 1.2799412160279555, + "learning_rate": 5.723845727223125e-06, + "loss": 0.5643, + "step": 8397 + }, + { + "epoch": 0.65, + "grad_norm": 1.057899726190161, + "learning_rate": 5.721574506186998e-06, + "loss": 0.4961, + "step": 8398 + }, + { + "epoch": 0.65, + "grad_norm": 1.0321424264100818, + "learning_rate": 5.719303555269946e-06, + "loss": 0.4315, + "step": 8399 + }, + { + "epoch": 0.65, + "grad_norm": 1.1565343146405827, + "learning_rate": 5.717032874615345e-06, + "loss": 0.5138, + "step": 8400 + }, + { + "epoch": 0.65, + "grad_norm": 1.1410248460563566, + "learning_rate": 5.714762464366561e-06, + "loss": 0.4653, + "step": 8401 + }, + { + "epoch": 0.65, + "grad_norm": 1.128930220299961, + "learning_rate": 5.712492324666927e-06, + "loss": 0.5215, + "step": 8402 + }, + { + "epoch": 0.65, + "grad_norm": 1.2231399992898826, + "learning_rate": 5.7102224556597775e-06, + "loss": 0.5664, + "step": 8403 + }, + { + "epoch": 0.65, + "grad_norm": 1.2105101292693339, + "learning_rate": 5.7079528574884125e-06, + "loss": 0.5789, + "step": 8404 + }, + { + "epoch": 0.65, + "grad_norm": 1.1467437710639632, + "learning_rate": 5.7056835302961266e-06, + "loss": 0.4865, + "step": 8405 + }, + { + "epoch": 0.65, + "grad_norm": 1.1429240950063595, + "learning_rate": 5.703414474226201e-06, + "loss": 0.5174, + "step": 8406 + }, + { + "epoch": 0.65, + "grad_norm": 1.1908314303166476, + "learning_rate": 5.701145689421882e-06, + "loss": 0.5352, + "step": 8407 + }, + { + "epoch": 0.65, + "grad_norm": 1.2017686307418005, + "learning_rate": 5.698877176026415e-06, + "loss": 0.5329, + "step": 8408 + }, + { + "epoch": 0.65, + "grad_norm": 1.210155012747007, + "learning_rate": 5.696608934183023e-06, + "loss": 0.5465, + "step": 8409 + }, + { + "epoch": 0.65, + "grad_norm": 1.186342227112146, + "learning_rate": 5.694340964034911e-06, + "loss": 0.5734, + "step": 8410 + }, + { + "epoch": 0.65, + "grad_norm": 1.1401453969490392, + "learning_rate": 5.692073265725273e-06, + "loss": 0.484, + "step": 8411 + }, + { + "epoch": 0.65, + "grad_norm": 1.0952495105170241, + "learning_rate": 5.689805839397271e-06, + "loss": 0.5017, + "step": 8412 + }, + { + "epoch": 0.65, + "grad_norm": 1.2199979896997868, + "learning_rate": 5.687538685194069e-06, + "loss": 0.4978, + "step": 8413 + }, + { + "epoch": 0.65, + "grad_norm": 1.077079431808361, + "learning_rate": 5.685271803258794e-06, + "loss": 0.4802, + "step": 8414 + }, + { + "epoch": 0.65, + "grad_norm": 1.2055374333610978, + "learning_rate": 5.683005193734572e-06, + "loss": 0.5326, + "step": 8415 + }, + { + "epoch": 0.65, + "grad_norm": 1.1562690733290126, + "learning_rate": 5.680738856764508e-06, + "loss": 0.5369, + "step": 8416 + }, + { + "epoch": 0.65, + "grad_norm": 1.1738617395548328, + "learning_rate": 5.67847279249168e-06, + "loss": 0.5227, + "step": 8417 + }, + { + "epoch": 0.65, + "grad_norm": 1.2168957224660126, + "learning_rate": 5.676207001059163e-06, + "loss": 0.5432, + "step": 8418 + }, + { + "epoch": 0.65, + "grad_norm": 1.0244021346831507, + "learning_rate": 5.673941482610004e-06, + "loss": 0.4651, + "step": 8419 + }, + { + "epoch": 0.65, + "grad_norm": 1.1248860301462438, + "learning_rate": 5.67167623728724e-06, + "loss": 0.4723, + "step": 8420 + }, + { + "epoch": 0.65, + "grad_norm": 1.0739386175931611, + "learning_rate": 5.6694112652338895e-06, + "loss": 0.5137, + "step": 8421 + }, + { + "epoch": 0.65, + "grad_norm": 1.059763301027072, + "learning_rate": 5.667146566592945e-06, + "loss": 0.4813, + "step": 8422 + }, + { + "epoch": 0.65, + "grad_norm": 1.20870554332382, + "learning_rate": 5.6648821415073965e-06, + "loss": 0.49, + "step": 8423 + }, + { + "epoch": 0.65, + "grad_norm": 1.0993197179864558, + "learning_rate": 5.662617990120201e-06, + "loss": 0.4769, + "step": 8424 + }, + { + "epoch": 0.65, + "grad_norm": 1.270928749296342, + "learning_rate": 5.660354112574309e-06, + "loss": 0.5251, + "step": 8425 + }, + { + "epoch": 0.65, + "grad_norm": 1.147109373087845, + "learning_rate": 5.658090509012651e-06, + "loss": 0.5149, + "step": 8426 + }, + { + "epoch": 0.65, + "grad_norm": 1.2236796717945462, + "learning_rate": 5.655827179578145e-06, + "loss": 0.5439, + "step": 8427 + }, + { + "epoch": 0.65, + "grad_norm": 1.1690986296927226, + "learning_rate": 5.653564124413678e-06, + "loss": 0.4971, + "step": 8428 + }, + { + "epoch": 0.65, + "grad_norm": 1.1824483837612974, + "learning_rate": 5.651301343662132e-06, + "loss": 0.5388, + "step": 8429 + }, + { + "epoch": 0.65, + "grad_norm": 1.0618730827255858, + "learning_rate": 5.649038837466369e-06, + "loss": 0.4805, + "step": 8430 + }, + { + "epoch": 0.65, + "grad_norm": 1.1365684069022948, + "learning_rate": 5.646776605969237e-06, + "loss": 0.5238, + "step": 8431 + }, + { + "epoch": 0.65, + "grad_norm": 1.1641473931201003, + "learning_rate": 5.644514649313554e-06, + "loss": 0.5511, + "step": 8432 + }, + { + "epoch": 0.65, + "grad_norm": 1.2852973744554859, + "learning_rate": 5.642252967642134e-06, + "loss": 0.531, + "step": 8433 + }, + { + "epoch": 0.65, + "grad_norm": 1.2137818433078988, + "learning_rate": 5.639991561097767e-06, + "loss": 0.5753, + "step": 8434 + }, + { + "epoch": 0.65, + "grad_norm": 1.1422237009555516, + "learning_rate": 5.637730429823224e-06, + "loss": 0.5511, + "step": 8435 + }, + { + "epoch": 0.65, + "grad_norm": 1.1509193021654671, + "learning_rate": 5.6354695739612665e-06, + "loss": 0.4956, + "step": 8436 + }, + { + "epoch": 0.65, + "grad_norm": 1.1055873631982116, + "learning_rate": 5.6332089936546375e-06, + "loss": 0.5111, + "step": 8437 + }, + { + "epoch": 0.65, + "grad_norm": 1.1053391774942984, + "learning_rate": 5.6309486890460494e-06, + "loss": 0.5161, + "step": 8438 + }, + { + "epoch": 0.65, + "grad_norm": 1.1290814748817206, + "learning_rate": 5.628688660278212e-06, + "loss": 0.5155, + "step": 8439 + }, + { + "epoch": 0.65, + "grad_norm": 1.1467821816452228, + "learning_rate": 5.6264289074938126e-06, + "loss": 0.5373, + "step": 8440 + }, + { + "epoch": 0.65, + "grad_norm": 1.32883142869552, + "learning_rate": 5.624169430835524e-06, + "loss": 0.5678, + "step": 8441 + }, + { + "epoch": 0.65, + "grad_norm": 1.2227426809163227, + "learning_rate": 5.621910230445993e-06, + "loss": 0.5664, + "step": 8442 + }, + { + "epoch": 0.65, + "grad_norm": 1.2090601982041547, + "learning_rate": 5.619651306467861e-06, + "loss": 0.5866, + "step": 8443 + }, + { + "epoch": 0.66, + "grad_norm": 1.195964809770765, + "learning_rate": 5.617392659043737e-06, + "loss": 0.4943, + "step": 8444 + }, + { + "epoch": 0.66, + "grad_norm": 1.077583453405085, + "learning_rate": 5.615134288316227e-06, + "loss": 0.551, + "step": 8445 + }, + { + "epoch": 0.66, + "grad_norm": 1.1009499761008235, + "learning_rate": 5.612876194427911e-06, + "loss": 0.526, + "step": 8446 + }, + { + "epoch": 0.66, + "grad_norm": 1.0947278692287028, + "learning_rate": 5.61061837752136e-06, + "loss": 0.489, + "step": 8447 + }, + { + "epoch": 0.66, + "grad_norm": 1.1213544482655873, + "learning_rate": 5.608360837739113e-06, + "loss": 0.5151, + "step": 8448 + }, + { + "epoch": 0.66, + "grad_norm": 1.2054608941208498, + "learning_rate": 5.6061035752237035e-06, + "loss": 0.4951, + "step": 8449 + }, + { + "epoch": 0.66, + "grad_norm": 1.1468636764258395, + "learning_rate": 5.6038465901176455e-06, + "loss": 0.5374, + "step": 8450 + }, + { + "epoch": 0.66, + "grad_norm": 1.1641627530956273, + "learning_rate": 5.601589882563436e-06, + "loss": 0.493, + "step": 8451 + }, + { + "epoch": 0.66, + "grad_norm": 1.1911011852486217, + "learning_rate": 5.599333452703548e-06, + "loss": 0.5355, + "step": 8452 + }, + { + "epoch": 0.66, + "grad_norm": 1.1434821831499857, + "learning_rate": 5.5970773006804465e-06, + "loss": 0.5303, + "step": 8453 + }, + { + "epoch": 0.66, + "grad_norm": 1.1601964256445938, + "learning_rate": 5.594821426636567e-06, + "loss": 0.5065, + "step": 8454 + }, + { + "epoch": 0.66, + "grad_norm": 1.0777001580464676, + "learning_rate": 5.5925658307143405e-06, + "loss": 0.4361, + "step": 8455 + }, + { + "epoch": 0.66, + "grad_norm": 1.1149810663677027, + "learning_rate": 5.590310513056171e-06, + "loss": 0.5034, + "step": 8456 + }, + { + "epoch": 0.66, + "grad_norm": 1.0965889734402938, + "learning_rate": 5.588055473804453e-06, + "loss": 0.5395, + "step": 8457 + }, + { + "epoch": 0.66, + "grad_norm": 1.2157905054077165, + "learning_rate": 5.585800713101552e-06, + "loss": 0.5396, + "step": 8458 + }, + { + "epoch": 0.66, + "grad_norm": 1.1496163370930719, + "learning_rate": 5.583546231089827e-06, + "loss": 0.5204, + "step": 8459 + }, + { + "epoch": 0.66, + "grad_norm": 1.1547193447981894, + "learning_rate": 5.581292027911614e-06, + "loss": 0.5335, + "step": 8460 + }, + { + "epoch": 0.66, + "grad_norm": 1.1413997475023059, + "learning_rate": 5.579038103709238e-06, + "loss": 0.5011, + "step": 8461 + }, + { + "epoch": 0.66, + "grad_norm": 1.1848394806704596, + "learning_rate": 5.576784458624991e-06, + "loss": 0.5875, + "step": 8462 + }, + { + "epoch": 0.66, + "grad_norm": 1.1876907195479356, + "learning_rate": 5.5745310928011656e-06, + "loss": 0.5729, + "step": 8463 + }, + { + "epoch": 0.66, + "grad_norm": 1.2527726893428552, + "learning_rate": 5.57227800638002e-06, + "loss": 0.5458, + "step": 8464 + }, + { + "epoch": 0.66, + "grad_norm": 1.1716997651371692, + "learning_rate": 5.570025199503808e-06, + "loss": 0.5917, + "step": 8465 + }, + { + "epoch": 0.66, + "grad_norm": 1.1851059220855302, + "learning_rate": 5.567772672314762e-06, + "loss": 0.5326, + "step": 8466 + }, + { + "epoch": 0.66, + "grad_norm": 1.1172082538944497, + "learning_rate": 5.565520424955097e-06, + "loss": 0.4709, + "step": 8467 + }, + { + "epoch": 0.66, + "grad_norm": 1.2249893985990217, + "learning_rate": 5.563268457567004e-06, + "loss": 0.5497, + "step": 8468 + }, + { + "epoch": 0.66, + "grad_norm": 1.1785875459700161, + "learning_rate": 5.561016770292662e-06, + "loss": 0.5436, + "step": 8469 + }, + { + "epoch": 0.66, + "grad_norm": 1.2230458476751565, + "learning_rate": 5.558765363274234e-06, + "loss": 0.5536, + "step": 8470 + }, + { + "epoch": 0.66, + "grad_norm": 1.1308483063941799, + "learning_rate": 5.556514236653867e-06, + "loss": 0.5109, + "step": 8471 + }, + { + "epoch": 0.66, + "grad_norm": 1.1662427211423936, + "learning_rate": 5.5542633905736775e-06, + "loss": 0.5295, + "step": 8472 + }, + { + "epoch": 0.66, + "grad_norm": 1.1748412674356221, + "learning_rate": 5.552012825175781e-06, + "loss": 0.5335, + "step": 8473 + }, + { + "epoch": 0.66, + "grad_norm": 1.2096217729218548, + "learning_rate": 5.549762540602261e-06, + "loss": 0.5262, + "step": 8474 + }, + { + "epoch": 0.66, + "grad_norm": 1.2318628073852937, + "learning_rate": 5.54751253699519e-06, + "loss": 0.5159, + "step": 8475 + }, + { + "epoch": 0.66, + "grad_norm": 1.1905554563382166, + "learning_rate": 5.545262814496625e-06, + "loss": 0.5302, + "step": 8476 + }, + { + "epoch": 0.66, + "grad_norm": 1.1472164591618066, + "learning_rate": 5.543013373248601e-06, + "loss": 0.5139, + "step": 8477 + }, + { + "epoch": 0.66, + "grad_norm": 1.238370537171197, + "learning_rate": 5.540764213393144e-06, + "loss": 0.5426, + "step": 8478 + }, + { + "epoch": 0.66, + "grad_norm": 1.1846252579615788, + "learning_rate": 5.538515335072243e-06, + "loss": 0.5121, + "step": 8479 + }, + { + "epoch": 0.66, + "grad_norm": 1.2225057003457815, + "learning_rate": 5.536266738427886e-06, + "loss": 0.5435, + "step": 8480 + }, + { + "epoch": 0.66, + "grad_norm": 1.2056883715105913, + "learning_rate": 5.534018423602047e-06, + "loss": 0.5828, + "step": 8481 + }, + { + "epoch": 0.66, + "grad_norm": 1.2752135827182332, + "learning_rate": 5.531770390736659e-06, + "loss": 0.5625, + "step": 8482 + }, + { + "epoch": 0.66, + "grad_norm": 1.1836965283378318, + "learning_rate": 5.529522639973666e-06, + "loss": 0.5268, + "step": 8483 + }, + { + "epoch": 0.66, + "grad_norm": 1.223927625987873, + "learning_rate": 5.527275171454969e-06, + "loss": 0.4761, + "step": 8484 + }, + { + "epoch": 0.66, + "grad_norm": 1.2690409491164432, + "learning_rate": 5.525027985322464e-06, + "loss": 0.5811, + "step": 8485 + }, + { + "epoch": 0.66, + "grad_norm": 1.160096394956788, + "learning_rate": 5.5227810817180325e-06, + "loss": 0.478, + "step": 8486 + }, + { + "epoch": 0.66, + "grad_norm": 1.2689591747417468, + "learning_rate": 5.520534460783531e-06, + "loss": 0.5614, + "step": 8487 + }, + { + "epoch": 0.66, + "grad_norm": 1.1784244875112335, + "learning_rate": 5.5182881226608035e-06, + "loss": 0.5059, + "step": 8488 + }, + { + "epoch": 0.66, + "grad_norm": 1.1366005013432052, + "learning_rate": 5.516042067491665e-06, + "loss": 0.5046, + "step": 8489 + }, + { + "epoch": 0.66, + "grad_norm": 1.1004874753191654, + "learning_rate": 5.513796295417925e-06, + "loss": 0.486, + "step": 8490 + }, + { + "epoch": 0.66, + "grad_norm": 1.1285325326045819, + "learning_rate": 5.511550806581374e-06, + "loss": 0.5122, + "step": 8491 + }, + { + "epoch": 0.66, + "grad_norm": 1.1319279867794765, + "learning_rate": 5.5093056011237755e-06, + "loss": 0.4847, + "step": 8492 + }, + { + "epoch": 0.66, + "grad_norm": 1.1975101389286498, + "learning_rate": 5.507060679186886e-06, + "loss": 0.5649, + "step": 8493 + }, + { + "epoch": 0.66, + "grad_norm": 1.0671562938679564, + "learning_rate": 5.504816040912433e-06, + "loss": 0.5085, + "step": 8494 + }, + { + "epoch": 0.66, + "grad_norm": 1.0458487419624567, + "learning_rate": 5.5025716864421356e-06, + "loss": 0.4772, + "step": 8495 + }, + { + "epoch": 0.66, + "grad_norm": 1.240019195392685, + "learning_rate": 5.500327615917691e-06, + "loss": 0.5743, + "step": 8496 + }, + { + "epoch": 0.66, + "grad_norm": 1.2476582526547406, + "learning_rate": 5.498083829480778e-06, + "loss": 0.5587, + "step": 8497 + }, + { + "epoch": 0.66, + "grad_norm": 1.0980409647138942, + "learning_rate": 5.495840327273065e-06, + "loss": 0.5549, + "step": 8498 + }, + { + "epoch": 0.66, + "grad_norm": 1.0337226585453367, + "learning_rate": 5.493597109436186e-06, + "loss": 0.456, + "step": 8499 + }, + { + "epoch": 0.66, + "grad_norm": 1.113755724000618, + "learning_rate": 5.491354176111773e-06, + "loss": 0.5262, + "step": 8500 + }, + { + "epoch": 0.66, + "grad_norm": 1.0530288086380135, + "learning_rate": 5.489111527441435e-06, + "loss": 0.5012, + "step": 8501 + }, + { + "epoch": 0.66, + "grad_norm": 1.2221671396664937, + "learning_rate": 5.486869163566756e-06, + "loss": 0.5657, + "step": 8502 + }, + { + "epoch": 0.66, + "grad_norm": 1.070779461872384, + "learning_rate": 5.4846270846293145e-06, + "loss": 0.4805, + "step": 8503 + }, + { + "epoch": 0.66, + "grad_norm": 1.157434938880406, + "learning_rate": 5.4823852907706585e-06, + "loss": 0.4983, + "step": 8504 + }, + { + "epoch": 0.66, + "grad_norm": 1.1750843606249153, + "learning_rate": 5.480143782132327e-06, + "loss": 0.5561, + "step": 8505 + }, + { + "epoch": 0.66, + "grad_norm": 1.20566795418941, + "learning_rate": 5.477902558855837e-06, + "loss": 0.5203, + "step": 8506 + }, + { + "epoch": 0.66, + "grad_norm": 1.1302033466750618, + "learning_rate": 5.475661621082689e-06, + "loss": 0.5136, + "step": 8507 + }, + { + "epoch": 0.66, + "grad_norm": 1.2599388299202012, + "learning_rate": 5.4734209689543705e-06, + "loss": 0.5404, + "step": 8508 + }, + { + "epoch": 0.66, + "grad_norm": 1.1136368568398085, + "learning_rate": 5.471180602612336e-06, + "loss": 0.453, + "step": 8509 + }, + { + "epoch": 0.66, + "grad_norm": 1.1352565314149756, + "learning_rate": 5.468940522198036e-06, + "loss": 0.4893, + "step": 8510 + }, + { + "epoch": 0.66, + "grad_norm": 1.2023863568604916, + "learning_rate": 5.4667007278529015e-06, + "loss": 0.496, + "step": 8511 + }, + { + "epoch": 0.66, + "grad_norm": 1.1360211411107353, + "learning_rate": 5.464461219718336e-06, + "loss": 0.4935, + "step": 8512 + }, + { + "epoch": 0.66, + "grad_norm": 1.1070736399781234, + "learning_rate": 5.462221997935737e-06, + "loss": 0.4995, + "step": 8513 + }, + { + "epoch": 0.66, + "grad_norm": 1.260995616306245, + "learning_rate": 5.45998306264647e-06, + "loss": 0.4525, + "step": 8514 + }, + { + "epoch": 0.66, + "grad_norm": 1.2252535791101584, + "learning_rate": 5.457744413991897e-06, + "loss": 0.5422, + "step": 8515 + }, + { + "epoch": 0.66, + "grad_norm": 1.2797914902909833, + "learning_rate": 5.455506052113354e-06, + "loss": 0.5789, + "step": 8516 + }, + { + "epoch": 0.66, + "grad_norm": 1.1960791329674854, + "learning_rate": 5.453267977152161e-06, + "loss": 0.5278, + "step": 8517 + }, + { + "epoch": 0.66, + "grad_norm": 1.2269451734982912, + "learning_rate": 5.4510301892496224e-06, + "loss": 0.5339, + "step": 8518 + }, + { + "epoch": 0.66, + "grad_norm": 1.1615682929066196, + "learning_rate": 5.448792688547012e-06, + "loss": 0.4901, + "step": 8519 + }, + { + "epoch": 0.66, + "grad_norm": 1.1604961064448818, + "learning_rate": 5.446555475185602e-06, + "loss": 0.5481, + "step": 8520 + }, + { + "epoch": 0.66, + "grad_norm": 1.2052080526436777, + "learning_rate": 5.444318549306641e-06, + "loss": 0.5326, + "step": 8521 + }, + { + "epoch": 0.66, + "grad_norm": 1.240185257547323, + "learning_rate": 5.44208191105135e-06, + "loss": 0.5421, + "step": 8522 + }, + { + "epoch": 0.66, + "grad_norm": 1.1496226624613037, + "learning_rate": 5.439845560560948e-06, + "loss": 0.5259, + "step": 8523 + }, + { + "epoch": 0.66, + "grad_norm": 1.2448385966670596, + "learning_rate": 5.437609497976619e-06, + "loss": 0.5128, + "step": 8524 + }, + { + "epoch": 0.66, + "grad_norm": 1.1222563772753282, + "learning_rate": 5.435373723439541e-06, + "loss": 0.4668, + "step": 8525 + }, + { + "epoch": 0.66, + "grad_norm": 1.2988610284982853, + "learning_rate": 5.43313823709087e-06, + "loss": 0.5933, + "step": 8526 + }, + { + "epoch": 0.66, + "grad_norm": 0.9877893180987131, + "learning_rate": 5.430903039071744e-06, + "loss": 0.4621, + "step": 8527 + }, + { + "epoch": 0.66, + "grad_norm": 1.171360716823358, + "learning_rate": 5.428668129523288e-06, + "loss": 0.5426, + "step": 8528 + }, + { + "epoch": 0.66, + "grad_norm": 1.1813154293286559, + "learning_rate": 5.426433508586593e-06, + "loss": 0.51, + "step": 8529 + }, + { + "epoch": 0.66, + "grad_norm": 1.2110931788632529, + "learning_rate": 5.4241991764027464e-06, + "loss": 0.5136, + "step": 8530 + }, + { + "epoch": 0.66, + "grad_norm": 1.1125414615577245, + "learning_rate": 5.421965133112818e-06, + "loss": 0.5046, + "step": 8531 + }, + { + "epoch": 0.66, + "grad_norm": 1.2388047035525844, + "learning_rate": 5.419731378857849e-06, + "loss": 0.5183, + "step": 8532 + }, + { + "epoch": 0.66, + "grad_norm": 1.2087526361225658, + "learning_rate": 5.417497913778866e-06, + "loss": 0.5537, + "step": 8533 + }, + { + "epoch": 0.66, + "grad_norm": 1.1528444398553923, + "learning_rate": 5.4152647380168876e-06, + "loss": 0.5384, + "step": 8534 + }, + { + "epoch": 0.66, + "grad_norm": 1.2111791554175395, + "learning_rate": 5.413031851712895e-06, + "loss": 0.5742, + "step": 8535 + }, + { + "epoch": 0.66, + "grad_norm": 1.2373031459639174, + "learning_rate": 5.410799255007868e-06, + "loss": 0.5383, + "step": 8536 + }, + { + "epoch": 0.66, + "grad_norm": 1.2333312148428242, + "learning_rate": 5.408566948042762e-06, + "loss": 0.5188, + "step": 8537 + }, + { + "epoch": 0.66, + "grad_norm": 1.103125903423685, + "learning_rate": 5.406334930958513e-06, + "loss": 0.4999, + "step": 8538 + }, + { + "epoch": 0.66, + "grad_norm": 1.2777737235037205, + "learning_rate": 5.404103203896044e-06, + "loss": 0.5057, + "step": 8539 + }, + { + "epoch": 0.66, + "grad_norm": 1.2015502333790002, + "learning_rate": 5.401871766996247e-06, + "loss": 0.5329, + "step": 8540 + }, + { + "epoch": 0.66, + "grad_norm": 1.23725940407713, + "learning_rate": 5.399640620400013e-06, + "loss": 0.5443, + "step": 8541 + }, + { + "epoch": 0.66, + "grad_norm": 1.2897518742150835, + "learning_rate": 5.397409764248197e-06, + "loss": 0.5461, + "step": 8542 + }, + { + "epoch": 0.66, + "grad_norm": 1.1129164784120786, + "learning_rate": 5.395179198681648e-06, + "loss": 0.5037, + "step": 8543 + }, + { + "epoch": 0.66, + "grad_norm": 1.149926083718583, + "learning_rate": 5.392948923841199e-06, + "loss": 0.5661, + "step": 8544 + }, + { + "epoch": 0.66, + "grad_norm": 1.1224459159752647, + "learning_rate": 5.39071893986765e-06, + "loss": 0.4964, + "step": 8545 + }, + { + "epoch": 0.66, + "grad_norm": 1.277206413515417, + "learning_rate": 5.388489246901796e-06, + "loss": 0.5627, + "step": 8546 + }, + { + "epoch": 0.66, + "grad_norm": 1.2132288266245856, + "learning_rate": 5.386259845084405e-06, + "loss": 0.5584, + "step": 8547 + }, + { + "epoch": 0.66, + "grad_norm": 1.1921675454165848, + "learning_rate": 5.384030734556236e-06, + "loss": 0.5132, + "step": 8548 + }, + { + "epoch": 0.66, + "grad_norm": 1.2253165264301842, + "learning_rate": 5.381801915458026e-06, + "loss": 0.5458, + "step": 8549 + }, + { + "epoch": 0.66, + "grad_norm": 1.1537922693777483, + "learning_rate": 5.379573387930484e-06, + "loss": 0.4989, + "step": 8550 + }, + { + "epoch": 0.66, + "grad_norm": 1.1037426200265723, + "learning_rate": 5.377345152114315e-06, + "loss": 0.4588, + "step": 8551 + }, + { + "epoch": 0.66, + "grad_norm": 1.2108923134525391, + "learning_rate": 5.3751172081501935e-06, + "loss": 0.517, + "step": 8552 + }, + { + "epoch": 0.66, + "grad_norm": 1.3200071986320028, + "learning_rate": 5.372889556178782e-06, + "loss": 0.6122, + "step": 8553 + }, + { + "epoch": 0.66, + "grad_norm": 1.1628269064056198, + "learning_rate": 5.370662196340732e-06, + "loss": 0.5179, + "step": 8554 + }, + { + "epoch": 0.66, + "grad_norm": 1.228707448663035, + "learning_rate": 5.368435128776657e-06, + "loss": 0.5027, + "step": 8555 + }, + { + "epoch": 0.66, + "grad_norm": 1.142716410819025, + "learning_rate": 5.366208353627167e-06, + "loss": 0.4519, + "step": 8556 + }, + { + "epoch": 0.66, + "grad_norm": 1.3166599199065854, + "learning_rate": 5.363981871032852e-06, + "loss": 0.5132, + "step": 8557 + }, + { + "epoch": 0.66, + "grad_norm": 1.1404255731857622, + "learning_rate": 5.36175568113428e-06, + "loss": 0.4961, + "step": 8558 + }, + { + "epoch": 0.66, + "grad_norm": 1.214787227382845, + "learning_rate": 5.359529784072006e-06, + "loss": 0.6058, + "step": 8559 + }, + { + "epoch": 0.66, + "grad_norm": 1.2205537506263417, + "learning_rate": 5.357304179986553e-06, + "loss": 0.5143, + "step": 8560 + }, + { + "epoch": 0.66, + "grad_norm": 1.1882664064462138, + "learning_rate": 5.355078869018446e-06, + "loss": 0.5136, + "step": 8561 + }, + { + "epoch": 0.66, + "grad_norm": 1.1278910576616812, + "learning_rate": 5.3528538513081705e-06, + "loss": 0.5075, + "step": 8562 + }, + { + "epoch": 0.66, + "grad_norm": 1.177618017822402, + "learning_rate": 5.350629126996207e-06, + "loss": 0.571, + "step": 8563 + }, + { + "epoch": 0.66, + "grad_norm": 1.219897732356841, + "learning_rate": 5.34840469622302e-06, + "loss": 0.5951, + "step": 8564 + }, + { + "epoch": 0.66, + "grad_norm": 1.1129376868449574, + "learning_rate": 5.346180559129037e-06, + "loss": 0.5193, + "step": 8565 + }, + { + "epoch": 0.66, + "grad_norm": 1.2290367876821564, + "learning_rate": 5.3439567158546865e-06, + "loss": 0.5916, + "step": 8566 + }, + { + "epoch": 0.66, + "grad_norm": 1.1429654500083193, + "learning_rate": 5.341733166540372e-06, + "loss": 0.5337, + "step": 8567 + }, + { + "epoch": 0.66, + "grad_norm": 1.3126618875846319, + "learning_rate": 5.339509911326475e-06, + "loss": 0.5541, + "step": 8568 + }, + { + "epoch": 0.66, + "grad_norm": 1.2042186274861924, + "learning_rate": 5.337286950353366e-06, + "loss": 0.5425, + "step": 8569 + }, + { + "epoch": 0.66, + "grad_norm": 1.1255786255429039, + "learning_rate": 5.3350642837613845e-06, + "loss": 0.555, + "step": 8570 + }, + { + "epoch": 0.66, + "grad_norm": 1.1246557238754393, + "learning_rate": 5.332841911690867e-06, + "loss": 0.5074, + "step": 8571 + }, + { + "epoch": 0.66, + "grad_norm": 1.1832620892546886, + "learning_rate": 5.330619834282116e-06, + "loss": 0.5007, + "step": 8572 + }, + { + "epoch": 0.67, + "grad_norm": 1.231502183755045, + "learning_rate": 5.328398051675423e-06, + "loss": 0.5083, + "step": 8573 + }, + { + "epoch": 0.67, + "grad_norm": 1.100667711823135, + "learning_rate": 5.32617656401107e-06, + "loss": 0.454, + "step": 8574 + }, + { + "epoch": 0.67, + "grad_norm": 1.0903712264317869, + "learning_rate": 5.323955371429299e-06, + "loss": 0.4832, + "step": 8575 + }, + { + "epoch": 0.67, + "grad_norm": 1.1942476004420102, + "learning_rate": 5.32173447407035e-06, + "loss": 0.5588, + "step": 8576 + }, + { + "epoch": 0.67, + "grad_norm": 1.232772704667899, + "learning_rate": 5.319513872074442e-06, + "loss": 0.5315, + "step": 8577 + }, + { + "epoch": 0.67, + "grad_norm": 1.1489232910751808, + "learning_rate": 5.31729356558177e-06, + "loss": 0.4915, + "step": 8578 + }, + { + "epoch": 0.67, + "grad_norm": 1.219642165893346, + "learning_rate": 5.31507355473252e-06, + "loss": 0.534, + "step": 8579 + }, + { + "epoch": 0.67, + "grad_norm": 1.1251108856655108, + "learning_rate": 5.312853839666843e-06, + "loss": 0.4948, + "step": 8580 + }, + { + "epoch": 0.67, + "grad_norm": 1.256947047873108, + "learning_rate": 5.31063442052489e-06, + "loss": 0.5913, + "step": 8581 + }, + { + "epoch": 0.67, + "grad_norm": 1.2139167806203217, + "learning_rate": 5.308415297446774e-06, + "loss": 0.5521, + "step": 8582 + }, + { + "epoch": 0.67, + "grad_norm": 1.0642537061444397, + "learning_rate": 5.306196470572606e-06, + "loss": 0.4871, + "step": 8583 + }, + { + "epoch": 0.67, + "grad_norm": 1.1650686332230558, + "learning_rate": 5.303977940042477e-06, + "loss": 0.5077, + "step": 8584 + }, + { + "epoch": 0.67, + "grad_norm": 1.2050793120689047, + "learning_rate": 5.301759705996446e-06, + "loss": 0.5008, + "step": 8585 + }, + { + "epoch": 0.67, + "grad_norm": 1.1789422112523302, + "learning_rate": 5.299541768574563e-06, + "loss": 0.5113, + "step": 8586 + }, + { + "epoch": 0.67, + "grad_norm": 1.0785134762774005, + "learning_rate": 5.297324127916858e-06, + "loss": 0.5204, + "step": 8587 + }, + { + "epoch": 0.67, + "grad_norm": 1.1194235026438761, + "learning_rate": 5.2951067841633465e-06, + "loss": 0.4781, + "step": 8588 + }, + { + "epoch": 0.67, + "grad_norm": 1.1258937146823271, + "learning_rate": 5.292889737454019e-06, + "loss": 0.497, + "step": 8589 + }, + { + "epoch": 0.67, + "grad_norm": 1.1190385755310723, + "learning_rate": 5.29067298792885e-06, + "loss": 0.5278, + "step": 8590 + }, + { + "epoch": 0.67, + "grad_norm": 1.2700620979235986, + "learning_rate": 5.288456535727786e-06, + "loss": 0.4954, + "step": 8591 + }, + { + "epoch": 0.67, + "grad_norm": 1.1660641351829342, + "learning_rate": 5.286240380990772e-06, + "loss": 0.5957, + "step": 8592 + }, + { + "epoch": 0.67, + "grad_norm": 1.1979560790281831, + "learning_rate": 5.284024523857721e-06, + "loss": 0.5393, + "step": 8593 + }, + { + "epoch": 0.67, + "grad_norm": 1.2155599660074705, + "learning_rate": 5.281808964468534e-06, + "loss": 0.5384, + "step": 8594 + }, + { + "epoch": 0.67, + "grad_norm": 1.1032186733166185, + "learning_rate": 5.2795937029630905e-06, + "loss": 0.5244, + "step": 8595 + }, + { + "epoch": 0.67, + "grad_norm": 1.1402868004639521, + "learning_rate": 5.277378739481249e-06, + "loss": 0.4849, + "step": 8596 + }, + { + "epoch": 0.67, + "grad_norm": 1.1043772886432794, + "learning_rate": 5.275164074162854e-06, + "loss": 0.4652, + "step": 8597 + }, + { + "epoch": 0.67, + "grad_norm": 1.1901302320879987, + "learning_rate": 5.2729497071477276e-06, + "loss": 0.4694, + "step": 8598 + }, + { + "epoch": 0.67, + "grad_norm": 1.1750360706555412, + "learning_rate": 5.270735638575677e-06, + "loss": 0.5124, + "step": 8599 + }, + { + "epoch": 0.67, + "grad_norm": 1.1766057972837987, + "learning_rate": 5.268521868586487e-06, + "loss": 0.5406, + "step": 8600 + }, + { + "epoch": 0.67, + "grad_norm": 1.2372698579506118, + "learning_rate": 5.266308397319918e-06, + "loss": 0.5525, + "step": 8601 + }, + { + "epoch": 0.67, + "grad_norm": 1.219008198310753, + "learning_rate": 5.264095224915722e-06, + "loss": 0.5339, + "step": 8602 + }, + { + "epoch": 0.67, + "grad_norm": 1.2359080391903958, + "learning_rate": 5.261882351513629e-06, + "loss": 0.4405, + "step": 8603 + }, + { + "epoch": 0.67, + "grad_norm": 1.2587329032214187, + "learning_rate": 5.259669777253349e-06, + "loss": 0.5463, + "step": 8604 + }, + { + "epoch": 0.67, + "grad_norm": 1.1094577382356248, + "learning_rate": 5.257457502274577e-06, + "loss": 0.4718, + "step": 8605 + }, + { + "epoch": 0.67, + "grad_norm": 1.1723870239649903, + "learning_rate": 5.255245526716976e-06, + "loss": 0.5151, + "step": 8606 + }, + { + "epoch": 0.67, + "grad_norm": 1.1580931947461246, + "learning_rate": 5.253033850720206e-06, + "loss": 0.5603, + "step": 8607 + }, + { + "epoch": 0.67, + "grad_norm": 1.2049124184908713, + "learning_rate": 5.2508224744239e-06, + "loss": 0.5852, + "step": 8608 + }, + { + "epoch": 0.67, + "grad_norm": 1.172163253936146, + "learning_rate": 5.2486113979676765e-06, + "loss": 0.5342, + "step": 8609 + }, + { + "epoch": 0.67, + "grad_norm": 1.23633598337192, + "learning_rate": 5.24640062149113e-06, + "loss": 0.5674, + "step": 8610 + }, + { + "epoch": 0.67, + "grad_norm": 1.15119312837603, + "learning_rate": 5.244190145133834e-06, + "loss": 0.4746, + "step": 8611 + }, + { + "epoch": 0.67, + "grad_norm": 1.1033367180748352, + "learning_rate": 5.241979969035351e-06, + "loss": 0.495, + "step": 8612 + }, + { + "epoch": 0.67, + "grad_norm": 1.1158579671266355, + "learning_rate": 5.239770093335219e-06, + "loss": 0.5072, + "step": 8613 + }, + { + "epoch": 0.67, + "grad_norm": 1.1623490799593086, + "learning_rate": 5.237560518172963e-06, + "loss": 0.5323, + "step": 8614 + }, + { + "epoch": 0.67, + "grad_norm": 1.1601295339674342, + "learning_rate": 5.235351243688085e-06, + "loss": 0.5236, + "step": 8615 + }, + { + "epoch": 0.67, + "grad_norm": 1.1915982951488509, + "learning_rate": 5.233142270020062e-06, + "loss": 0.5056, + "step": 8616 + }, + { + "epoch": 0.67, + "grad_norm": 1.1061701621864002, + "learning_rate": 5.230933597308361e-06, + "loss": 0.4866, + "step": 8617 + }, + { + "epoch": 0.67, + "grad_norm": 1.1430898710414628, + "learning_rate": 5.228725225692426e-06, + "loss": 0.46, + "step": 8618 + }, + { + "epoch": 0.67, + "grad_norm": 1.0734983490823276, + "learning_rate": 5.22651715531169e-06, + "loss": 0.4287, + "step": 8619 + }, + { + "epoch": 0.67, + "grad_norm": 1.2872246327407593, + "learning_rate": 5.224309386305553e-06, + "loss": 0.5998, + "step": 8620 + }, + { + "epoch": 0.67, + "grad_norm": 1.129060992026886, + "learning_rate": 5.222101918813399e-06, + "loss": 0.5322, + "step": 8621 + }, + { + "epoch": 0.67, + "grad_norm": 1.2066310478366338, + "learning_rate": 5.219894752974602e-06, + "loss": 0.5578, + "step": 8622 + }, + { + "epoch": 0.67, + "grad_norm": 1.1879721756865282, + "learning_rate": 5.217687888928512e-06, + "loss": 0.5333, + "step": 8623 + }, + { + "epoch": 0.67, + "grad_norm": 1.119153993004121, + "learning_rate": 5.215481326814459e-06, + "loss": 0.5662, + "step": 8624 + }, + { + "epoch": 0.67, + "grad_norm": 1.029380433022499, + "learning_rate": 5.213275066771759e-06, + "loss": 0.4834, + "step": 8625 + }, + { + "epoch": 0.67, + "grad_norm": 1.138691295111383, + "learning_rate": 5.211069108939695e-06, + "loss": 0.5219, + "step": 8626 + }, + { + "epoch": 0.67, + "grad_norm": 1.1652682927023097, + "learning_rate": 5.2088634534575465e-06, + "loss": 0.5716, + "step": 8627 + }, + { + "epoch": 0.67, + "grad_norm": 1.18220100671412, + "learning_rate": 5.206658100464568e-06, + "loss": 0.5451, + "step": 8628 + }, + { + "epoch": 0.67, + "grad_norm": 1.170097223301615, + "learning_rate": 5.204453050099997e-06, + "loss": 0.5406, + "step": 8629 + }, + { + "epoch": 0.67, + "grad_norm": 1.113602280758709, + "learning_rate": 5.202248302503047e-06, + "loss": 0.4961, + "step": 8630 + }, + { + "epoch": 0.67, + "grad_norm": 1.3095343099107075, + "learning_rate": 5.200043857812911e-06, + "loss": 0.5421, + "step": 8631 + }, + { + "epoch": 0.67, + "grad_norm": 1.2304595946925025, + "learning_rate": 5.19783971616877e-06, + "loss": 0.5503, + "step": 8632 + }, + { + "epoch": 0.67, + "grad_norm": 1.163510902337313, + "learning_rate": 5.195635877709783e-06, + "loss": 0.4635, + "step": 8633 + }, + { + "epoch": 0.67, + "grad_norm": 1.0673613690092136, + "learning_rate": 5.193432342575093e-06, + "loss": 0.4947, + "step": 8634 + }, + { + "epoch": 0.67, + "grad_norm": 1.0948594597110899, + "learning_rate": 5.191229110903819e-06, + "loss": 0.5048, + "step": 8635 + }, + { + "epoch": 0.67, + "grad_norm": 1.1360119591909523, + "learning_rate": 5.189026182835059e-06, + "loss": 0.5391, + "step": 8636 + }, + { + "epoch": 0.67, + "grad_norm": 1.129882073958542, + "learning_rate": 5.186823558507897e-06, + "loss": 0.5073, + "step": 8637 + }, + { + "epoch": 0.67, + "grad_norm": 1.1633678642292653, + "learning_rate": 5.184621238061397e-06, + "loss": 0.5447, + "step": 8638 + }, + { + "epoch": 0.67, + "grad_norm": 1.1809202410084578, + "learning_rate": 5.182419221634605e-06, + "loss": 0.5435, + "step": 8639 + }, + { + "epoch": 0.67, + "grad_norm": 1.160708671598167, + "learning_rate": 5.180217509366544e-06, + "loss": 0.5088, + "step": 8640 + }, + { + "epoch": 0.67, + "grad_norm": 1.1131224401874014, + "learning_rate": 5.178016101396215e-06, + "loss": 0.5037, + "step": 8641 + }, + { + "epoch": 0.67, + "grad_norm": 1.2057839773200891, + "learning_rate": 5.175814997862606e-06, + "loss": 0.5706, + "step": 8642 + }, + { + "epoch": 0.67, + "grad_norm": 1.1948588888097982, + "learning_rate": 5.173614198904686e-06, + "loss": 0.5504, + "step": 8643 + }, + { + "epoch": 0.67, + "grad_norm": 1.1293594876075312, + "learning_rate": 5.171413704661403e-06, + "loss": 0.5333, + "step": 8644 + }, + { + "epoch": 0.67, + "grad_norm": 1.2648609822296937, + "learning_rate": 5.169213515271686e-06, + "loss": 0.5791, + "step": 8645 + }, + { + "epoch": 0.67, + "grad_norm": 1.088862315624611, + "learning_rate": 5.167013630874447e-06, + "loss": 0.4896, + "step": 8646 + }, + { + "epoch": 0.67, + "grad_norm": 1.2043051938076799, + "learning_rate": 5.164814051608567e-06, + "loss": 0.5396, + "step": 8647 + }, + { + "epoch": 0.67, + "grad_norm": 1.2169972556309723, + "learning_rate": 5.162614777612924e-06, + "loss": 0.5411, + "step": 8648 + }, + { + "epoch": 0.67, + "grad_norm": 1.1343408731839044, + "learning_rate": 5.16041580902637e-06, + "loss": 0.547, + "step": 8649 + }, + { + "epoch": 0.67, + "grad_norm": 1.1817151258958554, + "learning_rate": 5.158217145987732e-06, + "loss": 0.5318, + "step": 8650 + }, + { + "epoch": 0.67, + "grad_norm": 1.097955574682945, + "learning_rate": 5.156018788635831e-06, + "loss": 0.5265, + "step": 8651 + }, + { + "epoch": 0.67, + "grad_norm": 1.198500278226194, + "learning_rate": 5.153820737109449e-06, + "loss": 0.519, + "step": 8652 + }, + { + "epoch": 0.67, + "grad_norm": 1.212108120070904, + "learning_rate": 5.151622991547368e-06, + "loss": 0.5715, + "step": 8653 + }, + { + "epoch": 0.67, + "grad_norm": 1.1006655456942376, + "learning_rate": 5.149425552088342e-06, + "loss": 0.4883, + "step": 8654 + }, + { + "epoch": 0.67, + "grad_norm": 1.0808006477462726, + "learning_rate": 5.1472284188711065e-06, + "loss": 0.497, + "step": 8655 + }, + { + "epoch": 0.67, + "grad_norm": 1.151524553230149, + "learning_rate": 5.145031592034382e-06, + "loss": 0.5578, + "step": 8656 + }, + { + "epoch": 0.67, + "grad_norm": 1.1521271582897061, + "learning_rate": 5.1428350717168575e-06, + "loss": 0.4992, + "step": 8657 + }, + { + "epoch": 0.67, + "grad_norm": 1.1410046820816142, + "learning_rate": 5.140638858057214e-06, + "loss": 0.5163, + "step": 8658 + }, + { + "epoch": 0.67, + "grad_norm": 1.2454646803703624, + "learning_rate": 5.138442951194115e-06, + "loss": 0.5194, + "step": 8659 + }, + { + "epoch": 0.67, + "grad_norm": 1.1634886178286548, + "learning_rate": 5.136247351266191e-06, + "loss": 0.4653, + "step": 8660 + }, + { + "epoch": 0.67, + "grad_norm": 1.1077306138780623, + "learning_rate": 5.134052058412069e-06, + "loss": 0.4784, + "step": 8661 + }, + { + "epoch": 0.67, + "grad_norm": 1.219274505822627, + "learning_rate": 5.131857072770341e-06, + "loss": 0.5251, + "step": 8662 + }, + { + "epoch": 0.67, + "grad_norm": 1.1778286568780172, + "learning_rate": 5.129662394479593e-06, + "loss": 0.5141, + "step": 8663 + }, + { + "epoch": 0.67, + "grad_norm": 1.2318246304120248, + "learning_rate": 5.1274680236783855e-06, + "loss": 0.5268, + "step": 8664 + }, + { + "epoch": 0.67, + "grad_norm": 1.0200855604451335, + "learning_rate": 5.125273960505261e-06, + "loss": 0.4815, + "step": 8665 + }, + { + "epoch": 0.67, + "grad_norm": 1.1186225584103466, + "learning_rate": 5.123080205098745e-06, + "loss": 0.5091, + "step": 8666 + }, + { + "epoch": 0.67, + "grad_norm": 1.196980229226421, + "learning_rate": 5.120886757597334e-06, + "loss": 0.5641, + "step": 8667 + }, + { + "epoch": 0.67, + "grad_norm": 1.0676452928246292, + "learning_rate": 5.118693618139514e-06, + "loss": 0.4918, + "step": 8668 + }, + { + "epoch": 0.67, + "grad_norm": 1.1282094633231983, + "learning_rate": 5.116500786863755e-06, + "loss": 0.5331, + "step": 8669 + }, + { + "epoch": 0.67, + "grad_norm": 1.1163498198847663, + "learning_rate": 5.114308263908493e-06, + "loss": 0.5015, + "step": 8670 + }, + { + "epoch": 0.67, + "grad_norm": 1.2569282219239768, + "learning_rate": 5.11211604941216e-06, + "loss": 0.5146, + "step": 8671 + }, + { + "epoch": 0.67, + "grad_norm": 1.024830459901714, + "learning_rate": 5.109924143513156e-06, + "loss": 0.4596, + "step": 8672 + }, + { + "epoch": 0.67, + "grad_norm": 1.1936340515387955, + "learning_rate": 5.107732546349871e-06, + "loss": 0.5151, + "step": 8673 + }, + { + "epoch": 0.67, + "grad_norm": 1.217341415996466, + "learning_rate": 5.105541258060669e-06, + "loss": 0.5312, + "step": 8674 + }, + { + "epoch": 0.67, + "grad_norm": 1.1673112121957276, + "learning_rate": 5.103350278783901e-06, + "loss": 0.5564, + "step": 8675 + }, + { + "epoch": 0.67, + "grad_norm": 1.2313955056436072, + "learning_rate": 5.101159608657896e-06, + "loss": 0.4974, + "step": 8676 + }, + { + "epoch": 0.67, + "grad_norm": 1.094801206876865, + "learning_rate": 5.098969247820958e-06, + "loss": 0.5384, + "step": 8677 + }, + { + "epoch": 0.67, + "grad_norm": 1.2621264669948749, + "learning_rate": 5.096779196411375e-06, + "loss": 0.5435, + "step": 8678 + }, + { + "epoch": 0.67, + "grad_norm": 1.1854095630846337, + "learning_rate": 5.094589454567423e-06, + "loss": 0.4999, + "step": 8679 + }, + { + "epoch": 0.67, + "grad_norm": 1.2592366845443836, + "learning_rate": 5.092400022427344e-06, + "loss": 0.5658, + "step": 8680 + }, + { + "epoch": 0.67, + "grad_norm": 1.1612638073973949, + "learning_rate": 5.090210900129375e-06, + "loss": 0.4336, + "step": 8681 + }, + { + "epoch": 0.67, + "grad_norm": 1.1419678713025245, + "learning_rate": 5.088022087811719e-06, + "loss": 0.5373, + "step": 8682 + }, + { + "epoch": 0.67, + "grad_norm": 1.1215963267244438, + "learning_rate": 5.0858335856125715e-06, + "loss": 0.4958, + "step": 8683 + }, + { + "epoch": 0.67, + "grad_norm": 1.0964654728558387, + "learning_rate": 5.083645393670103e-06, + "loss": 0.4404, + "step": 8684 + }, + { + "epoch": 0.67, + "grad_norm": 1.107117626236695, + "learning_rate": 5.081457512122466e-06, + "loss": 0.4801, + "step": 8685 + }, + { + "epoch": 0.67, + "grad_norm": 1.1830450114170197, + "learning_rate": 5.0792699411077976e-06, + "loss": 0.4896, + "step": 8686 + }, + { + "epoch": 0.67, + "grad_norm": 1.2074752040552232, + "learning_rate": 5.077082680764201e-06, + "loss": 0.5487, + "step": 8687 + }, + { + "epoch": 0.67, + "grad_norm": 1.2304433668636838, + "learning_rate": 5.074895731229772e-06, + "loss": 0.5596, + "step": 8688 + }, + { + "epoch": 0.67, + "grad_norm": 1.1889513582878708, + "learning_rate": 5.0727090926425915e-06, + "loss": 0.5316, + "step": 8689 + }, + { + "epoch": 0.67, + "grad_norm": 1.333391540965462, + "learning_rate": 5.070522765140703e-06, + "loss": 0.6063, + "step": 8690 + }, + { + "epoch": 0.67, + "grad_norm": 1.2302187711104398, + "learning_rate": 5.068336748862148e-06, + "loss": 0.5517, + "step": 8691 + }, + { + "epoch": 0.67, + "grad_norm": 1.15457893449672, + "learning_rate": 5.066151043944936e-06, + "loss": 0.5197, + "step": 8692 + }, + { + "epoch": 0.67, + "grad_norm": 1.3483178767499935, + "learning_rate": 5.063965650527063e-06, + "loss": 0.4857, + "step": 8693 + }, + { + "epoch": 0.67, + "grad_norm": 1.1334459835520658, + "learning_rate": 5.0617805687465064e-06, + "loss": 0.5382, + "step": 8694 + }, + { + "epoch": 0.67, + "grad_norm": 1.3725335801816276, + "learning_rate": 5.059595798741218e-06, + "loss": 0.5671, + "step": 8695 + }, + { + "epoch": 0.67, + "grad_norm": 1.2457734177555044, + "learning_rate": 5.0574113406491365e-06, + "loss": 0.5113, + "step": 8696 + }, + { + "epoch": 0.67, + "grad_norm": 1.216837531768351, + "learning_rate": 5.055227194608183e-06, + "loss": 0.5538, + "step": 8697 + }, + { + "epoch": 0.67, + "grad_norm": 1.1249549644780816, + "learning_rate": 5.053043360756247e-06, + "loss": 0.5618, + "step": 8698 + }, + { + "epoch": 0.67, + "grad_norm": 1.2482107708552286, + "learning_rate": 5.050859839231203e-06, + "loss": 0.4982, + "step": 8699 + }, + { + "epoch": 0.67, + "grad_norm": 1.2062114412934317, + "learning_rate": 5.0486766301709115e-06, + "loss": 0.5312, + "step": 8700 + }, + { + "epoch": 0.68, + "grad_norm": 1.1889998851837025, + "learning_rate": 5.046493733713209e-06, + "loss": 0.5058, + "step": 8701 + }, + { + "epoch": 0.68, + "grad_norm": 1.2844974951328014, + "learning_rate": 5.04431114999592e-06, + "loss": 0.5222, + "step": 8702 + }, + { + "epoch": 0.68, + "grad_norm": 1.2528252621288747, + "learning_rate": 5.0421288791568305e-06, + "loss": 0.5214, + "step": 8703 + }, + { + "epoch": 0.68, + "grad_norm": 1.1496130188531966, + "learning_rate": 5.0399469213337234e-06, + "loss": 0.5005, + "step": 8704 + }, + { + "epoch": 0.68, + "grad_norm": 1.2741424855810366, + "learning_rate": 5.037765276664359e-06, + "loss": 0.5238, + "step": 8705 + }, + { + "epoch": 0.68, + "grad_norm": 1.1740731032874818, + "learning_rate": 5.035583945286474e-06, + "loss": 0.5199, + "step": 8706 + }, + { + "epoch": 0.68, + "grad_norm": 1.2949348552622904, + "learning_rate": 5.033402927337792e-06, + "loss": 0.5813, + "step": 8707 + }, + { + "epoch": 0.68, + "grad_norm": 1.2079318738920737, + "learning_rate": 5.031222222956007e-06, + "loss": 0.5605, + "step": 8708 + }, + { + "epoch": 0.68, + "grad_norm": 1.186743746425818, + "learning_rate": 5.029041832278794e-06, + "loss": 0.5521, + "step": 8709 + }, + { + "epoch": 0.68, + "grad_norm": 1.1617586520229715, + "learning_rate": 5.026861755443817e-06, + "loss": 0.4885, + "step": 8710 + }, + { + "epoch": 0.68, + "grad_norm": 1.225214028637384, + "learning_rate": 5.024681992588717e-06, + "loss": 0.535, + "step": 8711 + }, + { + "epoch": 0.68, + "grad_norm": 1.0467486518794695, + "learning_rate": 5.022502543851116e-06, + "loss": 0.49, + "step": 8712 + }, + { + "epoch": 0.68, + "grad_norm": 1.1716284937674728, + "learning_rate": 5.020323409368604e-06, + "loss": 0.5247, + "step": 8713 + }, + { + "epoch": 0.68, + "grad_norm": 1.105648390121989, + "learning_rate": 5.018144589278768e-06, + "loss": 0.4994, + "step": 8714 + }, + { + "epoch": 0.68, + "grad_norm": 1.0612254070941622, + "learning_rate": 5.015966083719166e-06, + "loss": 0.4692, + "step": 8715 + }, + { + "epoch": 0.68, + "grad_norm": 1.2910441108365716, + "learning_rate": 5.013787892827341e-06, + "loss": 0.5977, + "step": 8716 + }, + { + "epoch": 0.68, + "grad_norm": 1.1590311779604443, + "learning_rate": 5.011610016740815e-06, + "loss": 0.5083, + "step": 8717 + }, + { + "epoch": 0.68, + "grad_norm": 1.2803542564832306, + "learning_rate": 5.009432455597085e-06, + "loss": 0.5506, + "step": 8718 + }, + { + "epoch": 0.68, + "grad_norm": 1.31318151854397, + "learning_rate": 5.007255209533629e-06, + "loss": 0.5438, + "step": 8719 + }, + { + "epoch": 0.68, + "grad_norm": 1.226707692876925, + "learning_rate": 5.00507827868791e-06, + "loss": 0.5622, + "step": 8720 + }, + { + "epoch": 0.68, + "grad_norm": 1.1978867181544943, + "learning_rate": 5.00290166319737e-06, + "loss": 0.5242, + "step": 8721 + }, + { + "epoch": 0.68, + "grad_norm": 1.1723747205196022, + "learning_rate": 5.000725363199433e-06, + "loss": 0.4761, + "step": 8722 + }, + { + "epoch": 0.68, + "grad_norm": 1.1578956955068445, + "learning_rate": 4.998549378831494e-06, + "loss": 0.556, + "step": 8723 + }, + { + "epoch": 0.68, + "grad_norm": 1.1441949864656908, + "learning_rate": 4.996373710230937e-06, + "loss": 0.4635, + "step": 8724 + }, + { + "epoch": 0.68, + "grad_norm": 1.087636209047633, + "learning_rate": 4.994198357535122e-06, + "loss": 0.463, + "step": 8725 + }, + { + "epoch": 0.68, + "grad_norm": 1.1115647317189916, + "learning_rate": 4.992023320881391e-06, + "loss": 0.5276, + "step": 8726 + }, + { + "epoch": 0.68, + "grad_norm": 1.1090953098236815, + "learning_rate": 4.989848600407069e-06, + "loss": 0.5932, + "step": 8727 + }, + { + "epoch": 0.68, + "grad_norm": 1.2186306626018797, + "learning_rate": 4.987674196249454e-06, + "loss": 0.5357, + "step": 8728 + }, + { + "epoch": 0.68, + "grad_norm": 1.113205649502695, + "learning_rate": 4.9855001085458246e-06, + "loss": 0.5084, + "step": 8729 + }, + { + "epoch": 0.68, + "grad_norm": 1.35781143416702, + "learning_rate": 4.983326337433444e-06, + "loss": 0.5489, + "step": 8730 + }, + { + "epoch": 0.68, + "grad_norm": 1.1932374482996237, + "learning_rate": 4.981152883049555e-06, + "loss": 0.557, + "step": 8731 + }, + { + "epoch": 0.68, + "grad_norm": 1.2348166532752183, + "learning_rate": 4.978979745531382e-06, + "loss": 0.5066, + "step": 8732 + }, + { + "epoch": 0.68, + "grad_norm": 1.163448453406435, + "learning_rate": 4.976806925016117e-06, + "loss": 0.5448, + "step": 8733 + }, + { + "epoch": 0.68, + "grad_norm": 1.245600827102656, + "learning_rate": 4.974634421640949e-06, + "loss": 0.5817, + "step": 8734 + }, + { + "epoch": 0.68, + "grad_norm": 1.2253249905063204, + "learning_rate": 4.972462235543038e-06, + "loss": 0.5275, + "step": 8735 + }, + { + "epoch": 0.68, + "grad_norm": 1.2104947665171129, + "learning_rate": 4.970290366859523e-06, + "loss": 0.5017, + "step": 8736 + }, + { + "epoch": 0.68, + "grad_norm": 1.1282664667335658, + "learning_rate": 4.968118815727532e-06, + "loss": 0.521, + "step": 8737 + }, + { + "epoch": 0.68, + "grad_norm": 1.1283011217088237, + "learning_rate": 4.965947582284161e-06, + "loss": 0.4936, + "step": 8738 + }, + { + "epoch": 0.68, + "grad_norm": 1.1676207569413988, + "learning_rate": 4.963776666666487e-06, + "loss": 0.5043, + "step": 8739 + }, + { + "epoch": 0.68, + "grad_norm": 1.193596249810098, + "learning_rate": 4.961606069011576e-06, + "loss": 0.4662, + "step": 8740 + }, + { + "epoch": 0.68, + "grad_norm": 1.233221166682347, + "learning_rate": 4.959435789456468e-06, + "loss": 0.5592, + "step": 8741 + }, + { + "epoch": 0.68, + "grad_norm": 1.144769689310219, + "learning_rate": 4.957265828138189e-06, + "loss": 0.5146, + "step": 8742 + }, + { + "epoch": 0.68, + "grad_norm": 1.1987768595434038, + "learning_rate": 4.955096185193732e-06, + "loss": 0.5294, + "step": 8743 + }, + { + "epoch": 0.68, + "grad_norm": 1.2877682286081171, + "learning_rate": 4.95292686076008e-06, + "loss": 0.5436, + "step": 8744 + }, + { + "epoch": 0.68, + "grad_norm": 1.2587809654904685, + "learning_rate": 4.950757854974195e-06, + "loss": 0.5423, + "step": 8745 + }, + { + "epoch": 0.68, + "grad_norm": 1.2094043501414709, + "learning_rate": 4.948589167973018e-06, + "loss": 0.5208, + "step": 8746 + }, + { + "epoch": 0.68, + "grad_norm": 1.4005826555053236, + "learning_rate": 4.946420799893472e-06, + "loss": 0.489, + "step": 8747 + }, + { + "epoch": 0.68, + "grad_norm": 1.0945550408184055, + "learning_rate": 4.944252750872455e-06, + "loss": 0.5158, + "step": 8748 + }, + { + "epoch": 0.68, + "grad_norm": 1.208639758269382, + "learning_rate": 4.94208502104684e-06, + "loss": 0.5094, + "step": 8749 + }, + { + "epoch": 0.68, + "grad_norm": 1.2441755496502385, + "learning_rate": 4.9399176105534954e-06, + "loss": 0.5618, + "step": 8750 + }, + { + "epoch": 0.68, + "grad_norm": 1.1204868494885198, + "learning_rate": 4.937750519529258e-06, + "loss": 0.4654, + "step": 8751 + }, + { + "epoch": 0.68, + "grad_norm": 1.0849894494980843, + "learning_rate": 4.935583748110947e-06, + "loss": 0.484, + "step": 8752 + }, + { + "epoch": 0.68, + "grad_norm": 1.1989675749625017, + "learning_rate": 4.933417296435367e-06, + "loss": 0.5035, + "step": 8753 + }, + { + "epoch": 0.68, + "grad_norm": 1.1804838208613058, + "learning_rate": 4.931251164639289e-06, + "loss": 0.4876, + "step": 8754 + }, + { + "epoch": 0.68, + "grad_norm": 1.2706302522249588, + "learning_rate": 4.929085352859478e-06, + "loss": 0.5765, + "step": 8755 + }, + { + "epoch": 0.68, + "grad_norm": 1.110206238858068, + "learning_rate": 4.92691986123267e-06, + "loss": 0.4961, + "step": 8756 + }, + { + "epoch": 0.68, + "grad_norm": 1.1098562459391241, + "learning_rate": 4.924754689895589e-06, + "loss": 0.4791, + "step": 8757 + }, + { + "epoch": 0.68, + "grad_norm": 1.1283116342119364, + "learning_rate": 4.922589838984929e-06, + "loss": 0.5065, + "step": 8758 + }, + { + "epoch": 0.68, + "grad_norm": 1.198320530799597, + "learning_rate": 4.920425308637365e-06, + "loss": 0.5839, + "step": 8759 + }, + { + "epoch": 0.68, + "grad_norm": 1.2064124437757886, + "learning_rate": 4.918261098989557e-06, + "loss": 0.5306, + "step": 8760 + }, + { + "epoch": 0.68, + "grad_norm": 1.1800092548880279, + "learning_rate": 4.9160972101781455e-06, + "loss": 0.5327, + "step": 8761 + }, + { + "epoch": 0.68, + "grad_norm": 1.1973206848129216, + "learning_rate": 4.913933642339747e-06, + "loss": 0.522, + "step": 8762 + }, + { + "epoch": 0.68, + "grad_norm": 1.1384549335042284, + "learning_rate": 4.911770395610961e-06, + "loss": 0.4791, + "step": 8763 + }, + { + "epoch": 0.68, + "grad_norm": 1.0463926997794826, + "learning_rate": 4.909607470128358e-06, + "loss": 0.4773, + "step": 8764 + }, + { + "epoch": 0.68, + "grad_norm": 1.2602889519082285, + "learning_rate": 4.907444866028499e-06, + "loss": 0.492, + "step": 8765 + }, + { + "epoch": 0.68, + "grad_norm": 1.1354075731852413, + "learning_rate": 4.90528258344792e-06, + "loss": 0.5507, + "step": 8766 + }, + { + "epoch": 0.68, + "grad_norm": 1.1896138200121311, + "learning_rate": 4.90312062252314e-06, + "loss": 0.5114, + "step": 8767 + }, + { + "epoch": 0.68, + "grad_norm": 1.1859678121319162, + "learning_rate": 4.900958983390651e-06, + "loss": 0.4944, + "step": 8768 + }, + { + "epoch": 0.68, + "grad_norm": 1.3874209922771823, + "learning_rate": 4.898797666186926e-06, + "loss": 0.6106, + "step": 8769 + }, + { + "epoch": 0.68, + "grad_norm": 1.1651552946524932, + "learning_rate": 4.896636671048424e-06, + "loss": 0.514, + "step": 8770 + }, + { + "epoch": 0.68, + "grad_norm": 1.2597598053553805, + "learning_rate": 4.894475998111578e-06, + "loss": 0.4905, + "step": 8771 + }, + { + "epoch": 0.68, + "grad_norm": 1.1090669875969148, + "learning_rate": 4.892315647512802e-06, + "loss": 0.4978, + "step": 8772 + }, + { + "epoch": 0.68, + "grad_norm": 1.3318073610327208, + "learning_rate": 4.890155619388497e-06, + "loss": 0.5835, + "step": 8773 + }, + { + "epoch": 0.68, + "grad_norm": 1.1785955364684588, + "learning_rate": 4.887995913875025e-06, + "loss": 0.5015, + "step": 8774 + }, + { + "epoch": 0.68, + "grad_norm": 1.270471594350697, + "learning_rate": 4.8858365311087475e-06, + "loss": 0.5193, + "step": 8775 + }, + { + "epoch": 0.68, + "grad_norm": 1.1723017614858855, + "learning_rate": 4.883677471225995e-06, + "loss": 0.5309, + "step": 8776 + }, + { + "epoch": 0.68, + "grad_norm": 1.0159863342732784, + "learning_rate": 4.881518734363084e-06, + "loss": 0.4668, + "step": 8777 + }, + { + "epoch": 0.68, + "grad_norm": 1.1852034396569366, + "learning_rate": 4.8793603206563034e-06, + "loss": 0.5226, + "step": 8778 + }, + { + "epoch": 0.68, + "grad_norm": 1.1410620387744448, + "learning_rate": 4.8772022302419206e-06, + "loss": 0.5536, + "step": 8779 + }, + { + "epoch": 0.68, + "grad_norm": 1.0879014732052814, + "learning_rate": 4.875044463256192e-06, + "loss": 0.5078, + "step": 8780 + }, + { + "epoch": 0.68, + "grad_norm": 1.1385734083678445, + "learning_rate": 4.872887019835347e-06, + "loss": 0.5632, + "step": 8781 + }, + { + "epoch": 0.68, + "grad_norm": 1.1599100795785098, + "learning_rate": 4.870729900115597e-06, + "loss": 0.5211, + "step": 8782 + }, + { + "epoch": 0.68, + "grad_norm": 1.1990562602246906, + "learning_rate": 4.868573104233137e-06, + "loss": 0.4641, + "step": 8783 + }, + { + "epoch": 0.68, + "grad_norm": 1.1820757608548764, + "learning_rate": 4.8664166323241255e-06, + "loss": 0.5194, + "step": 8784 + }, + { + "epoch": 0.68, + "grad_norm": 1.151315676610267, + "learning_rate": 4.864260484524719e-06, + "loss": 0.5376, + "step": 8785 + }, + { + "epoch": 0.68, + "grad_norm": 1.2034011127653805, + "learning_rate": 4.862104660971045e-06, + "loss": 0.4728, + "step": 8786 + }, + { + "epoch": 0.68, + "grad_norm": 1.233055375111033, + "learning_rate": 4.859949161799216e-06, + "loss": 0.5638, + "step": 8787 + }, + { + "epoch": 0.68, + "grad_norm": 1.1503347179253807, + "learning_rate": 4.857793987145315e-06, + "loss": 0.5237, + "step": 8788 + }, + { + "epoch": 0.68, + "grad_norm": 1.191761251621735, + "learning_rate": 4.855639137145406e-06, + "loss": 0.5091, + "step": 8789 + }, + { + "epoch": 0.68, + "grad_norm": 1.1556485648096828, + "learning_rate": 4.853484611935541e-06, + "loss": 0.5012, + "step": 8790 + }, + { + "epoch": 0.68, + "grad_norm": 1.2088060386157224, + "learning_rate": 4.8513304116517435e-06, + "loss": 0.5373, + "step": 8791 + }, + { + "epoch": 0.68, + "grad_norm": 1.1651812815934823, + "learning_rate": 4.8491765364300205e-06, + "loss": 0.5175, + "step": 8792 + }, + { + "epoch": 0.68, + "grad_norm": 1.3091126736814878, + "learning_rate": 4.847022986406362e-06, + "loss": 0.5356, + "step": 8793 + }, + { + "epoch": 0.68, + "grad_norm": 1.1208746293885075, + "learning_rate": 4.844869761716725e-06, + "loss": 0.4709, + "step": 8794 + }, + { + "epoch": 0.68, + "grad_norm": 1.241821523051576, + "learning_rate": 4.842716862497056e-06, + "loss": 0.51, + "step": 8795 + }, + { + "epoch": 0.68, + "grad_norm": 1.1208106025650384, + "learning_rate": 4.840564288883284e-06, + "loss": 0.4849, + "step": 8796 + }, + { + "epoch": 0.68, + "grad_norm": 1.3080502876128315, + "learning_rate": 4.838412041011304e-06, + "loss": 0.571, + "step": 8797 + }, + { + "epoch": 0.68, + "grad_norm": 1.146630402969278, + "learning_rate": 4.836260119017005e-06, + "loss": 0.5836, + "step": 8798 + }, + { + "epoch": 0.68, + "grad_norm": 1.2412763888962388, + "learning_rate": 4.8341085230362425e-06, + "loss": 0.4665, + "step": 8799 + }, + { + "epoch": 0.68, + "grad_norm": 1.1507169976202056, + "learning_rate": 4.831957253204862e-06, + "loss": 0.5157, + "step": 8800 + }, + { + "epoch": 0.68, + "grad_norm": 1.011558549488498, + "learning_rate": 4.829806309658683e-06, + "loss": 0.4807, + "step": 8801 + }, + { + "epoch": 0.68, + "grad_norm": 1.1604659569417641, + "learning_rate": 4.827655692533506e-06, + "loss": 0.4749, + "step": 8802 + }, + { + "epoch": 0.68, + "grad_norm": 1.110122321494262, + "learning_rate": 4.825505401965116e-06, + "loss": 0.4615, + "step": 8803 + }, + { + "epoch": 0.68, + "grad_norm": 1.1882902326877618, + "learning_rate": 4.823355438089262e-06, + "loss": 0.4703, + "step": 8804 + }, + { + "epoch": 0.68, + "grad_norm": 1.2340727810348104, + "learning_rate": 4.821205801041688e-06, + "loss": 0.5237, + "step": 8805 + }, + { + "epoch": 0.68, + "grad_norm": 1.2388076866567943, + "learning_rate": 4.819056490958115e-06, + "loss": 0.514, + "step": 8806 + }, + { + "epoch": 0.68, + "grad_norm": 1.1694750816296748, + "learning_rate": 4.816907507974231e-06, + "loss": 0.5062, + "step": 8807 + }, + { + "epoch": 0.68, + "grad_norm": 1.3178097179547208, + "learning_rate": 4.814758852225717e-06, + "loss": 0.5395, + "step": 8808 + }, + { + "epoch": 0.68, + "grad_norm": 1.3070496516234882, + "learning_rate": 4.8126105238482345e-06, + "loss": 0.5812, + "step": 8809 + }, + { + "epoch": 0.68, + "grad_norm": 1.2633119804688122, + "learning_rate": 4.810462522977408e-06, + "loss": 0.5497, + "step": 8810 + }, + { + "epoch": 0.68, + "grad_norm": 1.1145987762141907, + "learning_rate": 4.808314849748858e-06, + "loss": 0.5404, + "step": 8811 + }, + { + "epoch": 0.68, + "grad_norm": 1.2025891386370484, + "learning_rate": 4.806167504298175e-06, + "loss": 0.5238, + "step": 8812 + }, + { + "epoch": 0.68, + "grad_norm": 1.16903076879311, + "learning_rate": 4.8040204867609355e-06, + "loss": 0.499, + "step": 8813 + }, + { + "epoch": 0.68, + "grad_norm": 1.3988739270357924, + "learning_rate": 4.801873797272694e-06, + "loss": 0.61, + "step": 8814 + }, + { + "epoch": 0.68, + "grad_norm": 1.1930018512203053, + "learning_rate": 4.799727435968975e-06, + "loss": 0.553, + "step": 8815 + }, + { + "epoch": 0.68, + "grad_norm": 1.133625501717757, + "learning_rate": 4.797581402985296e-06, + "loss": 0.4546, + "step": 8816 + }, + { + "epoch": 0.68, + "grad_norm": 1.2365295824552878, + "learning_rate": 4.795435698457141e-06, + "loss": 0.5232, + "step": 8817 + }, + { + "epoch": 0.68, + "grad_norm": 1.20709993728702, + "learning_rate": 4.793290322519981e-06, + "loss": 0.5143, + "step": 8818 + }, + { + "epoch": 0.68, + "grad_norm": 1.108837857297423, + "learning_rate": 4.791145275309271e-06, + "loss": 0.5246, + "step": 8819 + }, + { + "epoch": 0.68, + "grad_norm": 1.1630115757511255, + "learning_rate": 4.7890005569604305e-06, + "loss": 0.4973, + "step": 8820 + }, + { + "epoch": 0.68, + "grad_norm": 1.2096021118616156, + "learning_rate": 4.786856167608869e-06, + "loss": 0.4928, + "step": 8821 + }, + { + "epoch": 0.68, + "grad_norm": 1.292562830152313, + "learning_rate": 4.784712107389975e-06, + "loss": 0.523, + "step": 8822 + }, + { + "epoch": 0.68, + "grad_norm": 1.1632057467745005, + "learning_rate": 4.7825683764391114e-06, + "loss": 0.526, + "step": 8823 + }, + { + "epoch": 0.68, + "grad_norm": 1.175446877824334, + "learning_rate": 4.780424974891629e-06, + "loss": 0.5266, + "step": 8824 + }, + { + "epoch": 0.68, + "grad_norm": 1.1399257031707766, + "learning_rate": 4.778281902882844e-06, + "loss": 0.4544, + "step": 8825 + }, + { + "epoch": 0.68, + "grad_norm": 1.3028203429720102, + "learning_rate": 4.776139160548068e-06, + "loss": 0.5362, + "step": 8826 + }, + { + "epoch": 0.68, + "grad_norm": 1.3395818337550256, + "learning_rate": 4.773996748022573e-06, + "loss": 0.5655, + "step": 8827 + }, + { + "epoch": 0.68, + "grad_norm": 1.2426064222723974, + "learning_rate": 4.771854665441626e-06, + "loss": 0.5484, + "step": 8828 + }, + { + "epoch": 0.68, + "grad_norm": 1.1789879651122728, + "learning_rate": 4.769712912940472e-06, + "loss": 0.5561, + "step": 8829 + }, + { + "epoch": 0.69, + "grad_norm": 1.1415945139358161, + "learning_rate": 4.767571490654322e-06, + "loss": 0.4989, + "step": 8830 + }, + { + "epoch": 0.69, + "grad_norm": 1.2775170911478029, + "learning_rate": 4.76543039871838e-06, + "loss": 0.542, + "step": 8831 + }, + { + "epoch": 0.69, + "grad_norm": 1.1832591172348037, + "learning_rate": 4.7632896372678235e-06, + "loss": 0.5082, + "step": 8832 + }, + { + "epoch": 0.69, + "grad_norm": 1.179159741106128, + "learning_rate": 4.761149206437811e-06, + "loss": 0.5019, + "step": 8833 + }, + { + "epoch": 0.69, + "grad_norm": 1.17375779536282, + "learning_rate": 4.759009106363482e-06, + "loss": 0.5362, + "step": 8834 + }, + { + "epoch": 0.69, + "grad_norm": 1.1856792947888322, + "learning_rate": 4.756869337179942e-06, + "loss": 0.5244, + "step": 8835 + }, + { + "epoch": 0.69, + "grad_norm": 1.247044550331615, + "learning_rate": 4.754729899022298e-06, + "loss": 0.5349, + "step": 8836 + }, + { + "epoch": 0.69, + "grad_norm": 1.1721126061201548, + "learning_rate": 4.752590792025612e-06, + "loss": 0.4768, + "step": 8837 + }, + { + "epoch": 0.69, + "grad_norm": 1.2372419646999036, + "learning_rate": 4.750452016324943e-06, + "loss": 0.5551, + "step": 8838 + }, + { + "epoch": 0.69, + "grad_norm": 1.201077837480701, + "learning_rate": 4.748313572055326e-06, + "loss": 0.5171, + "step": 8839 + }, + { + "epoch": 0.69, + "grad_norm": 1.0790756015330216, + "learning_rate": 4.746175459351765e-06, + "loss": 0.4984, + "step": 8840 + }, + { + "epoch": 0.69, + "grad_norm": 1.1799470224449369, + "learning_rate": 4.744037678349254e-06, + "loss": 0.5413, + "step": 8841 + }, + { + "epoch": 0.69, + "grad_norm": 1.1348939911662688, + "learning_rate": 4.74190022918276e-06, + "loss": 0.514, + "step": 8842 + }, + { + "epoch": 0.69, + "grad_norm": 1.059176351551484, + "learning_rate": 4.7397631119872335e-06, + "loss": 0.5196, + "step": 8843 + }, + { + "epoch": 0.69, + "grad_norm": 1.1550405207870562, + "learning_rate": 4.737626326897604e-06, + "loss": 0.516, + "step": 8844 + }, + { + "epoch": 0.69, + "grad_norm": 1.1138553678289136, + "learning_rate": 4.735489874048772e-06, + "loss": 0.5321, + "step": 8845 + }, + { + "epoch": 0.69, + "grad_norm": 1.1132218729882424, + "learning_rate": 4.733353753575629e-06, + "loss": 0.4097, + "step": 8846 + }, + { + "epoch": 0.69, + "grad_norm": 1.4144234381054221, + "learning_rate": 4.731217965613033e-06, + "loss": 0.5674, + "step": 8847 + }, + { + "epoch": 0.69, + "grad_norm": 1.1599987706638268, + "learning_rate": 4.729082510295829e-06, + "loss": 0.5178, + "step": 8848 + }, + { + "epoch": 0.69, + "grad_norm": 1.2147427728520006, + "learning_rate": 4.726947387758845e-06, + "loss": 0.4921, + "step": 8849 + }, + { + "epoch": 0.69, + "grad_norm": 1.2836594604835394, + "learning_rate": 4.724812598136873e-06, + "loss": 0.5406, + "step": 8850 + }, + { + "epoch": 0.69, + "grad_norm": 1.155998821297543, + "learning_rate": 4.722678141564698e-06, + "loss": 0.521, + "step": 8851 + }, + { + "epoch": 0.69, + "grad_norm": 1.0403337661068388, + "learning_rate": 4.720544018177081e-06, + "loss": 0.4557, + "step": 8852 + }, + { + "epoch": 0.69, + "grad_norm": 1.257351998327825, + "learning_rate": 4.718410228108757e-06, + "loss": 0.5171, + "step": 8853 + }, + { + "epoch": 0.69, + "grad_norm": 1.2262320589703215, + "learning_rate": 4.716276771494449e-06, + "loss": 0.4803, + "step": 8854 + }, + { + "epoch": 0.69, + "grad_norm": 1.2557203533296322, + "learning_rate": 4.714143648468845e-06, + "loss": 0.5221, + "step": 8855 + }, + { + "epoch": 0.69, + "grad_norm": 1.1901707481051114, + "learning_rate": 4.712010859166628e-06, + "loss": 0.5721, + "step": 8856 + }, + { + "epoch": 0.69, + "grad_norm": 1.277576810240922, + "learning_rate": 4.709878403722445e-06, + "loss": 0.5115, + "step": 8857 + }, + { + "epoch": 0.69, + "grad_norm": 1.1938510013213617, + "learning_rate": 4.70774628227093e-06, + "loss": 0.5465, + "step": 8858 + }, + { + "epoch": 0.69, + "grad_norm": 1.17692839466426, + "learning_rate": 4.7056144949467005e-06, + "loss": 0.5193, + "step": 8859 + }, + { + "epoch": 0.69, + "grad_norm": 1.1323715404049608, + "learning_rate": 4.70348304188434e-06, + "loss": 0.5282, + "step": 8860 + }, + { + "epoch": 0.69, + "grad_norm": 1.1541353425734135, + "learning_rate": 4.7013519232184225e-06, + "loss": 0.5003, + "step": 8861 + }, + { + "epoch": 0.69, + "grad_norm": 1.3676767400311258, + "learning_rate": 4.699221139083494e-06, + "loss": 0.5499, + "step": 8862 + }, + { + "epoch": 0.69, + "grad_norm": 1.2077386258568266, + "learning_rate": 4.697090689614084e-06, + "loss": 0.5235, + "step": 8863 + }, + { + "epoch": 0.69, + "grad_norm": 1.2054689042690176, + "learning_rate": 4.6949605749446995e-06, + "loss": 0.5145, + "step": 8864 + }, + { + "epoch": 0.69, + "grad_norm": 1.2237714367743355, + "learning_rate": 4.692830795209826e-06, + "loss": 0.5602, + "step": 8865 + }, + { + "epoch": 0.69, + "grad_norm": 1.1963040596158168, + "learning_rate": 4.690701350543927e-06, + "loss": 0.5039, + "step": 8866 + }, + { + "epoch": 0.69, + "grad_norm": 1.225439298451685, + "learning_rate": 4.6885722410814396e-06, + "loss": 0.5547, + "step": 8867 + }, + { + "epoch": 0.69, + "grad_norm": 1.2616693823636926, + "learning_rate": 4.68644346695679e-06, + "loss": 0.5171, + "step": 8868 + }, + { + "epoch": 0.69, + "grad_norm": 1.3255002283119859, + "learning_rate": 4.684315028304379e-06, + "loss": 0.5759, + "step": 8869 + }, + { + "epoch": 0.69, + "grad_norm": 1.2304056787220903, + "learning_rate": 4.68218692525859e-06, + "loss": 0.5558, + "step": 8870 + }, + { + "epoch": 0.69, + "grad_norm": 1.2182677855560362, + "learning_rate": 4.680059157953772e-06, + "loss": 0.5184, + "step": 8871 + }, + { + "epoch": 0.69, + "grad_norm": 1.218183875033662, + "learning_rate": 4.6779317265242675e-06, + "loss": 0.522, + "step": 8872 + }, + { + "epoch": 0.69, + "grad_norm": 1.226259230521611, + "learning_rate": 4.6758046311043915e-06, + "loss": 0.574, + "step": 8873 + }, + { + "epoch": 0.69, + "grad_norm": 1.2420230972611777, + "learning_rate": 4.673677871828437e-06, + "loss": 0.5738, + "step": 8874 + }, + { + "epoch": 0.69, + "grad_norm": 1.2252305689658154, + "learning_rate": 4.671551448830684e-06, + "loss": 0.5285, + "step": 8875 + }, + { + "epoch": 0.69, + "grad_norm": 1.2394194561424576, + "learning_rate": 4.669425362245378e-06, + "loss": 0.5243, + "step": 8876 + }, + { + "epoch": 0.69, + "grad_norm": 1.2249773801933952, + "learning_rate": 4.667299612206747e-06, + "loss": 0.5395, + "step": 8877 + }, + { + "epoch": 0.69, + "grad_norm": 1.2668172142426282, + "learning_rate": 4.665174198849006e-06, + "loss": 0.541, + "step": 8878 + }, + { + "epoch": 0.69, + "grad_norm": 1.2793023728114212, + "learning_rate": 4.6630491223063415e-06, + "loss": 0.5331, + "step": 8879 + }, + { + "epoch": 0.69, + "grad_norm": 1.1886959077259043, + "learning_rate": 4.660924382712923e-06, + "loss": 0.5406, + "step": 8880 + }, + { + "epoch": 0.69, + "grad_norm": 1.137707467075631, + "learning_rate": 4.658799980202893e-06, + "loss": 0.4997, + "step": 8881 + }, + { + "epoch": 0.69, + "grad_norm": 1.2855292709383201, + "learning_rate": 4.656675914910376e-06, + "loss": 0.5292, + "step": 8882 + }, + { + "epoch": 0.69, + "grad_norm": 1.2334257411275467, + "learning_rate": 4.654552186969477e-06, + "loss": 0.5588, + "step": 8883 + }, + { + "epoch": 0.69, + "grad_norm": 1.1992880338355738, + "learning_rate": 4.6524287965142765e-06, + "loss": 0.5029, + "step": 8884 + }, + { + "epoch": 0.69, + "grad_norm": 1.2504801781573172, + "learning_rate": 4.6503057436788405e-06, + "loss": 0.5656, + "step": 8885 + }, + { + "epoch": 0.69, + "grad_norm": 1.2238899320062235, + "learning_rate": 4.648183028597203e-06, + "loss": 0.513, + "step": 8886 + }, + { + "epoch": 0.69, + "grad_norm": 1.2239236813232504, + "learning_rate": 4.646060651403379e-06, + "loss": 0.4781, + "step": 8887 + }, + { + "epoch": 0.69, + "grad_norm": 1.2229128924717194, + "learning_rate": 4.643938612231369e-06, + "loss": 0.5468, + "step": 8888 + }, + { + "epoch": 0.69, + "grad_norm": 1.1652624614730998, + "learning_rate": 4.641816911215148e-06, + "loss": 0.489, + "step": 8889 + }, + { + "epoch": 0.69, + "grad_norm": 1.1466826441302262, + "learning_rate": 4.639695548488673e-06, + "loss": 0.482, + "step": 8890 + }, + { + "epoch": 0.69, + "grad_norm": 1.181788209888564, + "learning_rate": 4.63757452418587e-06, + "loss": 0.5463, + "step": 8891 + }, + { + "epoch": 0.69, + "grad_norm": 1.2867847548453584, + "learning_rate": 4.635453838440654e-06, + "loss": 0.5273, + "step": 8892 + }, + { + "epoch": 0.69, + "grad_norm": 1.089022255021199, + "learning_rate": 4.633333491386915e-06, + "loss": 0.4706, + "step": 8893 + }, + { + "epoch": 0.69, + "grad_norm": 1.1810854782095423, + "learning_rate": 4.631213483158525e-06, + "loss": 0.475, + "step": 8894 + }, + { + "epoch": 0.69, + "grad_norm": 1.1169802066582672, + "learning_rate": 4.6290938138893225e-06, + "loss": 0.5131, + "step": 8895 + }, + { + "epoch": 0.69, + "grad_norm": 1.2482594292634268, + "learning_rate": 4.626974483713142e-06, + "loss": 0.5399, + "step": 8896 + }, + { + "epoch": 0.69, + "grad_norm": 1.1119416861132356, + "learning_rate": 4.62485549276378e-06, + "loss": 0.5233, + "step": 8897 + }, + { + "epoch": 0.69, + "grad_norm": 1.2294986352001556, + "learning_rate": 4.622736841175023e-06, + "loss": 0.5361, + "step": 8898 + }, + { + "epoch": 0.69, + "grad_norm": 1.146167405874918, + "learning_rate": 4.620618529080632e-06, + "loss": 0.5348, + "step": 8899 + }, + { + "epoch": 0.69, + "grad_norm": 1.1541456197662716, + "learning_rate": 4.61850055661435e-06, + "loss": 0.4828, + "step": 8900 + }, + { + "epoch": 0.69, + "grad_norm": 1.208325528525216, + "learning_rate": 4.61638292390989e-06, + "loss": 0.5133, + "step": 8901 + }, + { + "epoch": 0.69, + "grad_norm": 1.2489954249601358, + "learning_rate": 4.614265631100952e-06, + "loss": 0.5533, + "step": 8902 + }, + { + "epoch": 0.69, + "grad_norm": 1.311399679814041, + "learning_rate": 4.612148678321211e-06, + "loss": 0.5022, + "step": 8903 + }, + { + "epoch": 0.69, + "grad_norm": 1.1329007344577136, + "learning_rate": 4.610032065704325e-06, + "loss": 0.5266, + "step": 8904 + }, + { + "epoch": 0.69, + "grad_norm": 1.213177092749023, + "learning_rate": 4.607915793383921e-06, + "loss": 0.5389, + "step": 8905 + }, + { + "epoch": 0.69, + "grad_norm": 1.2040836923824745, + "learning_rate": 4.605799861493615e-06, + "loss": 0.4958, + "step": 8906 + }, + { + "epoch": 0.69, + "grad_norm": 1.0646999807832698, + "learning_rate": 4.60368427016699e-06, + "loss": 0.5065, + "step": 8907 + }, + { + "epoch": 0.69, + "grad_norm": 1.1537965571308, + "learning_rate": 4.601569019537619e-06, + "loss": 0.5029, + "step": 8908 + }, + { + "epoch": 0.69, + "grad_norm": 1.199606916499002, + "learning_rate": 4.599454109739047e-06, + "loss": 0.4962, + "step": 8909 + }, + { + "epoch": 0.69, + "grad_norm": 1.193114659867559, + "learning_rate": 4.597339540904806e-06, + "loss": 0.5113, + "step": 8910 + }, + { + "epoch": 0.69, + "grad_norm": 1.2220495991950413, + "learning_rate": 4.595225313168389e-06, + "loss": 0.5851, + "step": 8911 + }, + { + "epoch": 0.69, + "grad_norm": 1.2444790510113326, + "learning_rate": 4.5931114266632826e-06, + "loss": 0.5201, + "step": 8912 + }, + { + "epoch": 0.69, + "grad_norm": 1.1684836078941885, + "learning_rate": 4.590997881522948e-06, + "loss": 0.4916, + "step": 8913 + }, + { + "epoch": 0.69, + "grad_norm": 1.1607906777531398, + "learning_rate": 4.588884677880828e-06, + "loss": 0.4283, + "step": 8914 + }, + { + "epoch": 0.69, + "grad_norm": 1.1203906682408602, + "learning_rate": 4.586771815870332e-06, + "loss": 0.5341, + "step": 8915 + }, + { + "epoch": 0.69, + "grad_norm": 1.1136149123926693, + "learning_rate": 4.5846592956248636e-06, + "loss": 0.5302, + "step": 8916 + }, + { + "epoch": 0.69, + "grad_norm": 1.1012851014101719, + "learning_rate": 4.582547117277789e-06, + "loss": 0.45, + "step": 8917 + }, + { + "epoch": 0.69, + "grad_norm": 1.1800675443132653, + "learning_rate": 4.580435280962466e-06, + "loss": 0.5559, + "step": 8918 + }, + { + "epoch": 0.69, + "grad_norm": 1.06715070848172, + "learning_rate": 4.578323786812225e-06, + "loss": 0.495, + "step": 8919 + }, + { + "epoch": 0.69, + "grad_norm": 1.2273294760045803, + "learning_rate": 4.5762126349603755e-06, + "loss": 0.5142, + "step": 8920 + }, + { + "epoch": 0.69, + "grad_norm": 1.1333739895166621, + "learning_rate": 4.574101825540209e-06, + "loss": 0.4715, + "step": 8921 + }, + { + "epoch": 0.69, + "grad_norm": 1.241579062723198, + "learning_rate": 4.571991358684984e-06, + "loss": 0.5607, + "step": 8922 + }, + { + "epoch": 0.69, + "grad_norm": 1.1906023158518209, + "learning_rate": 4.569881234527951e-06, + "loss": 0.5334, + "step": 8923 + }, + { + "epoch": 0.69, + "grad_norm": 1.2194152874648232, + "learning_rate": 4.5677714532023335e-06, + "loss": 0.5325, + "step": 8924 + }, + { + "epoch": 0.69, + "grad_norm": 1.1392613450624556, + "learning_rate": 4.565662014841328e-06, + "loss": 0.5424, + "step": 8925 + }, + { + "epoch": 0.69, + "grad_norm": 1.2520186813258727, + "learning_rate": 4.563552919578119e-06, + "loss": 0.4893, + "step": 8926 + }, + { + "epoch": 0.69, + "grad_norm": 1.1390535686545693, + "learning_rate": 4.561444167545861e-06, + "loss": 0.5007, + "step": 8927 + }, + { + "epoch": 0.69, + "grad_norm": 1.1405801633618593, + "learning_rate": 4.559335758877691e-06, + "loss": 0.5332, + "step": 8928 + }, + { + "epoch": 0.69, + "grad_norm": 1.1148411045698017, + "learning_rate": 4.557227693706724e-06, + "loss": 0.5014, + "step": 8929 + }, + { + "epoch": 0.69, + "grad_norm": 1.0837685859960804, + "learning_rate": 4.555119972166055e-06, + "loss": 0.4503, + "step": 8930 + }, + { + "epoch": 0.69, + "grad_norm": 1.194008957901136, + "learning_rate": 4.553012594388757e-06, + "loss": 0.4709, + "step": 8931 + }, + { + "epoch": 0.69, + "grad_norm": 1.1867817664225282, + "learning_rate": 4.550905560507873e-06, + "loss": 0.4792, + "step": 8932 + }, + { + "epoch": 0.69, + "grad_norm": 1.126105083664196, + "learning_rate": 4.548798870656434e-06, + "loss": 0.4946, + "step": 8933 + }, + { + "epoch": 0.69, + "grad_norm": 1.207783930392495, + "learning_rate": 4.546692524967452e-06, + "loss": 0.4909, + "step": 8934 + }, + { + "epoch": 0.69, + "grad_norm": 1.2831898867128917, + "learning_rate": 4.544586523573902e-06, + "loss": 0.5173, + "step": 8935 + }, + { + "epoch": 0.69, + "grad_norm": 1.1978771645487492, + "learning_rate": 4.542480866608754e-06, + "loss": 0.4947, + "step": 8936 + }, + { + "epoch": 0.69, + "grad_norm": 1.2085342185432035, + "learning_rate": 4.540375554204944e-06, + "loss": 0.4989, + "step": 8937 + }, + { + "epoch": 0.69, + "grad_norm": 1.3025025214319115, + "learning_rate": 4.5382705864953915e-06, + "loss": 0.5308, + "step": 8938 + }, + { + "epoch": 0.69, + "grad_norm": 1.1459573447426146, + "learning_rate": 4.536165963612995e-06, + "loss": 0.5085, + "step": 8939 + }, + { + "epoch": 0.69, + "grad_norm": 1.1297496562230431, + "learning_rate": 4.534061685690633e-06, + "loss": 0.4663, + "step": 8940 + }, + { + "epoch": 0.69, + "grad_norm": 1.1866729265397942, + "learning_rate": 4.531957752861161e-06, + "loss": 0.5364, + "step": 8941 + }, + { + "epoch": 0.69, + "grad_norm": 1.0903828152533144, + "learning_rate": 4.5298541652574016e-06, + "loss": 0.4813, + "step": 8942 + }, + { + "epoch": 0.69, + "grad_norm": 1.130244481553155, + "learning_rate": 4.527750923012172e-06, + "loss": 0.5402, + "step": 8943 + }, + { + "epoch": 0.69, + "grad_norm": 1.2343152671557724, + "learning_rate": 4.525648026258264e-06, + "loss": 0.5803, + "step": 8944 + }, + { + "epoch": 0.69, + "grad_norm": 1.2548806750802566, + "learning_rate": 4.5235454751284355e-06, + "loss": 0.4966, + "step": 8945 + }, + { + "epoch": 0.69, + "grad_norm": 1.2473058754399593, + "learning_rate": 4.52144326975544e-06, + "loss": 0.5163, + "step": 8946 + }, + { + "epoch": 0.69, + "grad_norm": 1.2336144337481356, + "learning_rate": 4.5193414102719935e-06, + "loss": 0.5444, + "step": 8947 + }, + { + "epoch": 0.69, + "grad_norm": 1.1298813881696133, + "learning_rate": 4.5172398968108e-06, + "loss": 0.4969, + "step": 8948 + }, + { + "epoch": 0.69, + "grad_norm": 1.1866582597656203, + "learning_rate": 4.515138729504539e-06, + "loss": 0.5538, + "step": 8949 + }, + { + "epoch": 0.69, + "grad_norm": 1.1886517811661692, + "learning_rate": 4.513037908485868e-06, + "loss": 0.496, + "step": 8950 + }, + { + "epoch": 0.69, + "grad_norm": 1.1325533340223817, + "learning_rate": 4.5109374338874254e-06, + "loss": 0.5065, + "step": 8951 + }, + { + "epoch": 0.69, + "grad_norm": 1.1962072975086617, + "learning_rate": 4.508837305841821e-06, + "loss": 0.4837, + "step": 8952 + }, + { + "epoch": 0.69, + "grad_norm": 1.0876591160444244, + "learning_rate": 4.506737524481647e-06, + "loss": 0.4869, + "step": 8953 + }, + { + "epoch": 0.69, + "grad_norm": 1.183129650818177, + "learning_rate": 4.504638089939478e-06, + "loss": 0.5118, + "step": 8954 + }, + { + "epoch": 0.69, + "grad_norm": 1.1534481123128846, + "learning_rate": 4.502539002347856e-06, + "loss": 0.4832, + "step": 8955 + }, + { + "epoch": 0.69, + "grad_norm": 1.0818674672799473, + "learning_rate": 4.500440261839313e-06, + "loss": 0.5361, + "step": 8956 + }, + { + "epoch": 0.69, + "grad_norm": 1.1698920703078248, + "learning_rate": 4.498341868546347e-06, + "loss": 0.4734, + "step": 8957 + }, + { + "epoch": 0.69, + "grad_norm": 1.1398021917892027, + "learning_rate": 4.496243822601443e-06, + "loss": 0.5307, + "step": 8958 + }, + { + "epoch": 0.7, + "grad_norm": 1.116735567035923, + "learning_rate": 4.494146124137062e-06, + "loss": 0.5238, + "step": 8959 + }, + { + "epoch": 0.7, + "grad_norm": 1.103240014150979, + "learning_rate": 4.4920487732856425e-06, + "loss": 0.4728, + "step": 8960 + }, + { + "epoch": 0.7, + "grad_norm": 1.1926066870319436, + "learning_rate": 4.489951770179606e-06, + "loss": 0.5121, + "step": 8961 + }, + { + "epoch": 0.7, + "grad_norm": 1.2199523570185355, + "learning_rate": 4.487855114951337e-06, + "loss": 0.503, + "step": 8962 + }, + { + "epoch": 0.7, + "grad_norm": 1.20279337288249, + "learning_rate": 4.485758807733215e-06, + "loss": 0.5601, + "step": 8963 + }, + { + "epoch": 0.7, + "grad_norm": 1.368473298483761, + "learning_rate": 4.483662848657593e-06, + "loss": 0.5541, + "step": 8964 + }, + { + "epoch": 0.7, + "grad_norm": 1.294734705405784, + "learning_rate": 4.481567237856792e-06, + "loss": 0.5298, + "step": 8965 + }, + { + "epoch": 0.7, + "grad_norm": 1.166117090254619, + "learning_rate": 4.479471975463125e-06, + "loss": 0.4939, + "step": 8966 + }, + { + "epoch": 0.7, + "grad_norm": 1.1123919230908959, + "learning_rate": 4.477377061608873e-06, + "loss": 0.5006, + "step": 8967 + }, + { + "epoch": 0.7, + "grad_norm": 1.2363939312962728, + "learning_rate": 4.475282496426298e-06, + "loss": 0.4813, + "step": 8968 + }, + { + "epoch": 0.7, + "grad_norm": 1.2289990079082709, + "learning_rate": 4.473188280047644e-06, + "loss": 0.5464, + "step": 8969 + }, + { + "epoch": 0.7, + "grad_norm": 1.2900006666107822, + "learning_rate": 4.471094412605128e-06, + "loss": 0.4877, + "step": 8970 + }, + { + "epoch": 0.7, + "grad_norm": 1.2796262360790707, + "learning_rate": 4.469000894230947e-06, + "loss": 0.5291, + "step": 8971 + }, + { + "epoch": 0.7, + "grad_norm": 1.438104419519546, + "learning_rate": 4.46690772505728e-06, + "loss": 0.5639, + "step": 8972 + }, + { + "epoch": 0.7, + "grad_norm": 1.143021456793221, + "learning_rate": 4.464814905216271e-06, + "loss": 0.48, + "step": 8973 + }, + { + "epoch": 0.7, + "grad_norm": 1.2237088483490053, + "learning_rate": 4.46272243484006e-06, + "loss": 0.5276, + "step": 8974 + }, + { + "epoch": 0.7, + "grad_norm": 1.1318541583749597, + "learning_rate": 4.4606303140607456e-06, + "loss": 0.514, + "step": 8975 + }, + { + "epoch": 0.7, + "grad_norm": 1.2253948412242925, + "learning_rate": 4.458538543010418e-06, + "loss": 0.5543, + "step": 8976 + }, + { + "epoch": 0.7, + "grad_norm": 1.119878662042073, + "learning_rate": 4.456447121821147e-06, + "loss": 0.5048, + "step": 8977 + }, + { + "epoch": 0.7, + "grad_norm": 1.1276534894180283, + "learning_rate": 4.454356050624966e-06, + "loss": 0.493, + "step": 8978 + }, + { + "epoch": 0.7, + "grad_norm": 1.234279966921267, + "learning_rate": 4.4522653295539e-06, + "loss": 0.5141, + "step": 8979 + }, + { + "epoch": 0.7, + "grad_norm": 1.161801799173484, + "learning_rate": 4.450174958739945e-06, + "loss": 0.4985, + "step": 8980 + }, + { + "epoch": 0.7, + "grad_norm": 1.2165572663769564, + "learning_rate": 4.448084938315079e-06, + "loss": 0.5171, + "step": 8981 + }, + { + "epoch": 0.7, + "grad_norm": 1.2023938917743457, + "learning_rate": 4.4459952684112596e-06, + "loss": 0.5307, + "step": 8982 + }, + { + "epoch": 0.7, + "grad_norm": 1.1567017987070833, + "learning_rate": 4.443905949160409e-06, + "loss": 0.5185, + "step": 8983 + }, + { + "epoch": 0.7, + "grad_norm": 1.1106023849048148, + "learning_rate": 4.441816980694446e-06, + "loss": 0.5237, + "step": 8984 + }, + { + "epoch": 0.7, + "grad_norm": 1.1334221613424291, + "learning_rate": 4.439728363145248e-06, + "loss": 0.4591, + "step": 8985 + }, + { + "epoch": 0.7, + "grad_norm": 1.1269532307801617, + "learning_rate": 4.437640096644687e-06, + "loss": 0.4717, + "step": 8986 + }, + { + "epoch": 0.7, + "grad_norm": 1.128895162244155, + "learning_rate": 4.435552181324608e-06, + "loss": 0.5089, + "step": 8987 + }, + { + "epoch": 0.7, + "grad_norm": 1.3376907087890968, + "learning_rate": 4.433464617316825e-06, + "loss": 0.5502, + "step": 8988 + }, + { + "epoch": 0.7, + "grad_norm": 1.2668222016076862, + "learning_rate": 4.431377404753141e-06, + "loss": 0.5537, + "step": 8989 + }, + { + "epoch": 0.7, + "grad_norm": 1.1287194331731123, + "learning_rate": 4.429290543765331e-06, + "loss": 0.4858, + "step": 8990 + }, + { + "epoch": 0.7, + "grad_norm": 1.198636538055483, + "learning_rate": 4.42720403448515e-06, + "loss": 0.4949, + "step": 8991 + }, + { + "epoch": 0.7, + "grad_norm": 1.2931505519136397, + "learning_rate": 4.4251178770443334e-06, + "loss": 0.5091, + "step": 8992 + }, + { + "epoch": 0.7, + "grad_norm": 1.286485210966762, + "learning_rate": 4.4230320715745855e-06, + "loss": 0.5279, + "step": 8993 + }, + { + "epoch": 0.7, + "grad_norm": 1.1815235426724866, + "learning_rate": 4.420946618207599e-06, + "loss": 0.5304, + "step": 8994 + }, + { + "epoch": 0.7, + "grad_norm": 1.0962344154807344, + "learning_rate": 4.418861517075034e-06, + "loss": 0.4803, + "step": 8995 + }, + { + "epoch": 0.7, + "grad_norm": 1.137327208337767, + "learning_rate": 4.416776768308535e-06, + "loss": 0.4889, + "step": 8996 + }, + { + "epoch": 0.7, + "grad_norm": 1.2428856093571454, + "learning_rate": 4.4146923720397285e-06, + "loss": 0.5146, + "step": 8997 + }, + { + "epoch": 0.7, + "grad_norm": 1.2433412100531351, + "learning_rate": 4.412608328400205e-06, + "loss": 0.5488, + "step": 8998 + }, + { + "epoch": 0.7, + "grad_norm": 1.1965893676004902, + "learning_rate": 4.410524637521545e-06, + "loss": 0.489, + "step": 8999 + }, + { + "epoch": 0.7, + "grad_norm": 1.1757382103022482, + "learning_rate": 4.408441299535302e-06, + "loss": 0.5419, + "step": 9000 + }, + { + "epoch": 0.7, + "grad_norm": 1.1486985824584184, + "learning_rate": 4.406358314573009e-06, + "loss": 0.525, + "step": 9001 + }, + { + "epoch": 0.7, + "grad_norm": 1.2258168280393857, + "learning_rate": 4.404275682766179e-06, + "loss": 0.5373, + "step": 9002 + }, + { + "epoch": 0.7, + "grad_norm": 1.227782741932641, + "learning_rate": 4.402193404246291e-06, + "loss": 0.5119, + "step": 9003 + }, + { + "epoch": 0.7, + "grad_norm": 1.3380673925194606, + "learning_rate": 4.400111479144818e-06, + "loss": 0.5753, + "step": 9004 + }, + { + "epoch": 0.7, + "grad_norm": 1.3106562970981084, + "learning_rate": 4.398029907593197e-06, + "loss": 0.5248, + "step": 9005 + }, + { + "epoch": 0.7, + "grad_norm": 1.226884593100216, + "learning_rate": 4.395948689722847e-06, + "loss": 0.5076, + "step": 9006 + }, + { + "epoch": 0.7, + "grad_norm": 1.1029174263584895, + "learning_rate": 4.393867825665176e-06, + "loss": 0.4753, + "step": 9007 + }, + { + "epoch": 0.7, + "grad_norm": 1.2616214773751981, + "learning_rate": 4.391787315551548e-06, + "loss": 0.5051, + "step": 9008 + }, + { + "epoch": 0.7, + "grad_norm": 1.2207651351437323, + "learning_rate": 4.389707159513321e-06, + "loss": 0.5314, + "step": 9009 + }, + { + "epoch": 0.7, + "grad_norm": 1.2251088463829816, + "learning_rate": 4.387627357681827e-06, + "loss": 0.5467, + "step": 9010 + }, + { + "epoch": 0.7, + "grad_norm": 1.343512536291729, + "learning_rate": 4.3855479101883735e-06, + "loss": 0.4916, + "step": 9011 + }, + { + "epoch": 0.7, + "grad_norm": 1.367318762881241, + "learning_rate": 4.383468817164251e-06, + "loss": 0.5804, + "step": 9012 + }, + { + "epoch": 0.7, + "grad_norm": 1.1305766172162948, + "learning_rate": 4.3813900787407175e-06, + "loss": 0.5422, + "step": 9013 + }, + { + "epoch": 0.7, + "grad_norm": 1.2445914561046298, + "learning_rate": 4.379311695049018e-06, + "loss": 0.5362, + "step": 9014 + }, + { + "epoch": 0.7, + "grad_norm": 1.1317631037319846, + "learning_rate": 4.377233666220368e-06, + "loss": 0.478, + "step": 9015 + }, + { + "epoch": 0.7, + "grad_norm": 1.1509479409748156, + "learning_rate": 4.375155992385965e-06, + "loss": 0.5206, + "step": 9016 + }, + { + "epoch": 0.7, + "grad_norm": 1.2466173178209359, + "learning_rate": 4.373078673676988e-06, + "loss": 0.5729, + "step": 9017 + }, + { + "epoch": 0.7, + "grad_norm": 1.1610129946655845, + "learning_rate": 4.371001710224583e-06, + "loss": 0.5047, + "step": 9018 + }, + { + "epoch": 0.7, + "grad_norm": 1.3180130567265718, + "learning_rate": 4.368925102159881e-06, + "loss": 0.6043, + "step": 9019 + }, + { + "epoch": 0.7, + "grad_norm": 1.2063429267943142, + "learning_rate": 4.36684884961399e-06, + "loss": 0.5395, + "step": 9020 + }, + { + "epoch": 0.7, + "grad_norm": 1.063843830959475, + "learning_rate": 4.364772952717995e-06, + "loss": 0.4738, + "step": 9021 + }, + { + "epoch": 0.7, + "grad_norm": 1.2126084642957016, + "learning_rate": 4.362697411602961e-06, + "loss": 0.5186, + "step": 9022 + }, + { + "epoch": 0.7, + "grad_norm": 1.3467054468337225, + "learning_rate": 4.36062222639992e-06, + "loss": 0.5187, + "step": 9023 + }, + { + "epoch": 0.7, + "grad_norm": 1.008991350422317, + "learning_rate": 4.358547397239896e-06, + "loss": 0.4546, + "step": 9024 + }, + { + "epoch": 0.7, + "grad_norm": 1.1955933583724399, + "learning_rate": 4.356472924253878e-06, + "loss": 0.4827, + "step": 9025 + }, + { + "epoch": 0.7, + "grad_norm": 1.167043466119726, + "learning_rate": 4.354398807572841e-06, + "loss": 0.5093, + "step": 9026 + }, + { + "epoch": 0.7, + "grad_norm": 1.1582693045687293, + "learning_rate": 4.352325047327735e-06, + "loss": 0.4977, + "step": 9027 + }, + { + "epoch": 0.7, + "grad_norm": 1.1859898752902895, + "learning_rate": 4.350251643649491e-06, + "loss": 0.5098, + "step": 9028 + }, + { + "epoch": 0.7, + "grad_norm": 1.1979494118015914, + "learning_rate": 4.348178596669006e-06, + "loss": 0.5576, + "step": 9029 + }, + { + "epoch": 0.7, + "grad_norm": 1.1964071908354001, + "learning_rate": 4.346105906517165e-06, + "loss": 0.5045, + "step": 9030 + }, + { + "epoch": 0.7, + "grad_norm": 1.0921139606390746, + "learning_rate": 4.344033573324829e-06, + "loss": 0.5427, + "step": 9031 + }, + { + "epoch": 0.7, + "grad_norm": 1.1642329456046006, + "learning_rate": 4.341961597222837e-06, + "loss": 0.517, + "step": 9032 + }, + { + "epoch": 0.7, + "grad_norm": 1.2031739955685403, + "learning_rate": 4.339889978341998e-06, + "loss": 0.5285, + "step": 9033 + }, + { + "epoch": 0.7, + "grad_norm": 1.1498781367708157, + "learning_rate": 4.337818716813112e-06, + "loss": 0.4556, + "step": 9034 + }, + { + "epoch": 0.7, + "grad_norm": 1.1543747412532068, + "learning_rate": 4.3357478127669376e-06, + "loss": 0.5595, + "step": 9035 + }, + { + "epoch": 0.7, + "grad_norm": 1.1339921793995993, + "learning_rate": 4.333677266334228e-06, + "loss": 0.5066, + "step": 9036 + }, + { + "epoch": 0.7, + "grad_norm": 1.143158072634017, + "learning_rate": 4.331607077645708e-06, + "loss": 0.5123, + "step": 9037 + }, + { + "epoch": 0.7, + "grad_norm": 1.130208145821317, + "learning_rate": 4.329537246832081e-06, + "loss": 0.5337, + "step": 9038 + }, + { + "epoch": 0.7, + "grad_norm": 1.2730728282547898, + "learning_rate": 4.32746777402402e-06, + "loss": 0.569, + "step": 9039 + }, + { + "epoch": 0.7, + "grad_norm": 1.2290372726524488, + "learning_rate": 4.325398659352185e-06, + "loss": 0.5115, + "step": 9040 + }, + { + "epoch": 0.7, + "grad_norm": 1.1891656036390024, + "learning_rate": 4.32332990294721e-06, + "loss": 0.5049, + "step": 9041 + }, + { + "epoch": 0.7, + "grad_norm": 1.1179078987243145, + "learning_rate": 4.321261504939709e-06, + "loss": 0.4646, + "step": 9042 + }, + { + "epoch": 0.7, + "grad_norm": 1.1863923176972653, + "learning_rate": 4.319193465460264e-06, + "loss": 0.4984, + "step": 9043 + }, + { + "epoch": 0.7, + "grad_norm": 1.1185951168319976, + "learning_rate": 4.317125784639447e-06, + "loss": 0.5301, + "step": 9044 + }, + { + "epoch": 0.7, + "grad_norm": 1.176268315106781, + "learning_rate": 4.315058462607798e-06, + "loss": 0.5121, + "step": 9045 + }, + { + "epoch": 0.7, + "grad_norm": 1.2470900997781682, + "learning_rate": 4.312991499495838e-06, + "loss": 0.4866, + "step": 9046 + }, + { + "epoch": 0.7, + "grad_norm": 1.1629646808184575, + "learning_rate": 4.3109248954340644e-06, + "loss": 0.5193, + "step": 9047 + }, + { + "epoch": 0.7, + "grad_norm": 1.2061735395796462, + "learning_rate": 4.3088586505529584e-06, + "loss": 0.5191, + "step": 9048 + }, + { + "epoch": 0.7, + "grad_norm": 1.2681183920021164, + "learning_rate": 4.306792764982964e-06, + "loss": 0.5095, + "step": 9049 + }, + { + "epoch": 0.7, + "grad_norm": 1.2479141952838317, + "learning_rate": 4.304727238854517e-06, + "loss": 0.5393, + "step": 9050 + }, + { + "epoch": 0.7, + "grad_norm": 1.1450066734519282, + "learning_rate": 4.302662072298022e-06, + "loss": 0.5013, + "step": 9051 + }, + { + "epoch": 0.7, + "grad_norm": 1.0914050441508638, + "learning_rate": 4.300597265443869e-06, + "loss": 0.4534, + "step": 9052 + }, + { + "epoch": 0.7, + "grad_norm": 1.2837500488073235, + "learning_rate": 4.298532818422411e-06, + "loss": 0.5182, + "step": 9053 + }, + { + "epoch": 0.7, + "grad_norm": 1.2041853653579049, + "learning_rate": 4.296468731363996e-06, + "loss": 0.5444, + "step": 9054 + }, + { + "epoch": 0.7, + "grad_norm": 1.21620430761527, + "learning_rate": 4.294405004398933e-06, + "loss": 0.5009, + "step": 9055 + }, + { + "epoch": 0.7, + "grad_norm": 1.0903579975321351, + "learning_rate": 4.292341637657519e-06, + "loss": 0.4994, + "step": 9056 + }, + { + "epoch": 0.7, + "grad_norm": 1.2316055133144848, + "learning_rate": 4.290278631270025e-06, + "loss": 0.5261, + "step": 9057 + }, + { + "epoch": 0.7, + "grad_norm": 1.0141801974465687, + "learning_rate": 4.288215985366702e-06, + "loss": 0.4315, + "step": 9058 + }, + { + "epoch": 0.7, + "grad_norm": 1.0767958367420276, + "learning_rate": 4.286153700077771e-06, + "loss": 0.5066, + "step": 9059 + }, + { + "epoch": 0.7, + "grad_norm": 1.177301988576125, + "learning_rate": 4.284091775533436e-06, + "loss": 0.5125, + "step": 9060 + }, + { + "epoch": 0.7, + "grad_norm": 1.17649758655059, + "learning_rate": 4.282030211863876e-06, + "loss": 0.4898, + "step": 9061 + }, + { + "epoch": 0.7, + "grad_norm": 1.182321550885035, + "learning_rate": 4.279969009199254e-06, + "loss": 0.5145, + "step": 9062 + }, + { + "epoch": 0.7, + "grad_norm": 1.051888488755576, + "learning_rate": 4.277908167669696e-06, + "loss": 0.4722, + "step": 9063 + }, + { + "epoch": 0.7, + "grad_norm": 1.1668018137901053, + "learning_rate": 4.275847687405323e-06, + "loss": 0.4904, + "step": 9064 + }, + { + "epoch": 0.7, + "grad_norm": 1.2407152580271852, + "learning_rate": 4.273787568536212e-06, + "loss": 0.5195, + "step": 9065 + }, + { + "epoch": 0.7, + "grad_norm": 1.1544091804206766, + "learning_rate": 4.271727811192437e-06, + "loss": 0.5086, + "step": 9066 + }, + { + "epoch": 0.7, + "grad_norm": 1.2598565593694218, + "learning_rate": 4.269668415504038e-06, + "loss": 0.5537, + "step": 9067 + }, + { + "epoch": 0.7, + "grad_norm": 1.2668573949086492, + "learning_rate": 4.2676093816010415e-06, + "loss": 0.5703, + "step": 9068 + }, + { + "epoch": 0.7, + "grad_norm": 1.1430526921599626, + "learning_rate": 4.265550709613435e-06, + "loss": 0.507, + "step": 9069 + }, + { + "epoch": 0.7, + "grad_norm": 1.252846718803969, + "learning_rate": 4.263492399671198e-06, + "loss": 0.5123, + "step": 9070 + }, + { + "epoch": 0.7, + "grad_norm": 1.3555196703050318, + "learning_rate": 4.261434451904284e-06, + "loss": 0.5459, + "step": 9071 + }, + { + "epoch": 0.7, + "grad_norm": 1.229745028360689, + "learning_rate": 4.259376866442623e-06, + "loss": 0.4873, + "step": 9072 + }, + { + "epoch": 0.7, + "grad_norm": 1.255695955313812, + "learning_rate": 4.2573196434161135e-06, + "loss": 0.5207, + "step": 9073 + }, + { + "epoch": 0.7, + "grad_norm": 1.1803718754360313, + "learning_rate": 4.255262782954648e-06, + "loss": 0.5578, + "step": 9074 + }, + { + "epoch": 0.7, + "grad_norm": 1.2344349713786555, + "learning_rate": 4.253206285188079e-06, + "loss": 0.5368, + "step": 9075 + }, + { + "epoch": 0.7, + "grad_norm": 1.4108657953876795, + "learning_rate": 4.251150150246245e-06, + "loss": 0.575, + "step": 9076 + }, + { + "epoch": 0.7, + "grad_norm": 1.1229832374123374, + "learning_rate": 4.249094378258962e-06, + "loss": 0.4783, + "step": 9077 + }, + { + "epoch": 0.7, + "grad_norm": 1.3340768628431836, + "learning_rate": 4.247038969356027e-06, + "loss": 0.4533, + "step": 9078 + }, + { + "epoch": 0.7, + "grad_norm": 1.1259958839578421, + "learning_rate": 4.244983923667199e-06, + "loss": 0.4871, + "step": 9079 + }, + { + "epoch": 0.7, + "grad_norm": 1.2534941474403605, + "learning_rate": 4.242929241322228e-06, + "loss": 0.5428, + "step": 9080 + }, + { + "epoch": 0.7, + "grad_norm": 1.213416091686869, + "learning_rate": 4.2408749224508365e-06, + "loss": 0.5219, + "step": 9081 + }, + { + "epoch": 0.7, + "grad_norm": 1.2258433280673946, + "learning_rate": 4.238820967182727e-06, + "loss": 0.5277, + "step": 9082 + }, + { + "epoch": 0.7, + "grad_norm": 1.2000336503238085, + "learning_rate": 4.236767375647572e-06, + "loss": 0.5672, + "step": 9083 + }, + { + "epoch": 0.7, + "grad_norm": 1.2368155869858863, + "learning_rate": 4.234714147975029e-06, + "loss": 0.4728, + "step": 9084 + }, + { + "epoch": 0.7, + "grad_norm": 1.2055063337003815, + "learning_rate": 4.2326612842947225e-06, + "loss": 0.4983, + "step": 9085 + }, + { + "epoch": 0.7, + "grad_norm": 1.1654620366044424, + "learning_rate": 4.230608784736267e-06, + "loss": 0.4837, + "step": 9086 + }, + { + "epoch": 0.7, + "grad_norm": 1.060445313482052, + "learning_rate": 4.228556649429243e-06, + "loss": 0.4678, + "step": 9087 + }, + { + "epoch": 0.71, + "grad_norm": 1.2776551406629741, + "learning_rate": 4.226504878503215e-06, + "loss": 0.5178, + "step": 9088 + }, + { + "epoch": 0.71, + "grad_norm": 1.299215066965357, + "learning_rate": 4.224453472087725e-06, + "loss": 0.5927, + "step": 9089 + }, + { + "epoch": 0.71, + "grad_norm": 1.2028548692915406, + "learning_rate": 4.2224024303122826e-06, + "loss": 0.5724, + "step": 9090 + }, + { + "epoch": 0.71, + "grad_norm": 1.1625284929782964, + "learning_rate": 4.220351753306382e-06, + "loss": 0.5354, + "step": 9091 + }, + { + "epoch": 0.71, + "grad_norm": 1.2092231674892457, + "learning_rate": 4.218301441199499e-06, + "loss": 0.498, + "step": 9092 + }, + { + "epoch": 0.71, + "grad_norm": 1.2556249895186018, + "learning_rate": 4.216251494121071e-06, + "loss": 0.5134, + "step": 9093 + }, + { + "epoch": 0.71, + "grad_norm": 1.190240959246921, + "learning_rate": 4.2142019122005295e-06, + "loss": 0.5008, + "step": 9094 + }, + { + "epoch": 0.71, + "grad_norm": 1.2247433627140112, + "learning_rate": 4.21215269556727e-06, + "loss": 0.5458, + "step": 9095 + }, + { + "epoch": 0.71, + "grad_norm": 1.1163885821258495, + "learning_rate": 4.210103844350671e-06, + "loss": 0.5175, + "step": 9096 + }, + { + "epoch": 0.71, + "grad_norm": 1.226696031433189, + "learning_rate": 4.208055358680089e-06, + "loss": 0.5964, + "step": 9097 + }, + { + "epoch": 0.71, + "grad_norm": 1.1163335884639174, + "learning_rate": 4.2060072386848535e-06, + "loss": 0.5178, + "step": 9098 + }, + { + "epoch": 0.71, + "grad_norm": 1.1274637159781098, + "learning_rate": 4.20395948449428e-06, + "loss": 0.542, + "step": 9099 + }, + { + "epoch": 0.71, + "grad_norm": 1.1440689663572732, + "learning_rate": 4.201912096237643e-06, + "loss": 0.5457, + "step": 9100 + }, + { + "epoch": 0.71, + "grad_norm": 1.169601422015864, + "learning_rate": 4.1998650740442096e-06, + "loss": 0.5159, + "step": 9101 + }, + { + "epoch": 0.71, + "grad_norm": 1.284009938207532, + "learning_rate": 4.197818418043221e-06, + "loss": 0.5242, + "step": 9102 + }, + { + "epoch": 0.71, + "grad_norm": 1.236423482727802, + "learning_rate": 4.19577212836389e-06, + "loss": 0.5448, + "step": 9103 + }, + { + "epoch": 0.71, + "grad_norm": 1.31498469448973, + "learning_rate": 4.193726205135412e-06, + "loss": 0.5721, + "step": 9104 + }, + { + "epoch": 0.71, + "grad_norm": 1.2688654635652281, + "learning_rate": 4.191680648486952e-06, + "loss": 0.5855, + "step": 9105 + }, + { + "epoch": 0.71, + "grad_norm": 1.2228991964848057, + "learning_rate": 4.18963545854766e-06, + "loss": 0.5339, + "step": 9106 + }, + { + "epoch": 0.71, + "grad_norm": 1.2019483088265752, + "learning_rate": 4.187590635446659e-06, + "loss": 0.4596, + "step": 9107 + }, + { + "epoch": 0.71, + "grad_norm": 1.2065018168663766, + "learning_rate": 4.185546179313049e-06, + "loss": 0.5293, + "step": 9108 + }, + { + "epoch": 0.71, + "grad_norm": 1.1991801302279743, + "learning_rate": 4.183502090275911e-06, + "loss": 0.536, + "step": 9109 + }, + { + "epoch": 0.71, + "grad_norm": 1.115730402479264, + "learning_rate": 4.181458368464293e-06, + "loss": 0.5061, + "step": 9110 + }, + { + "epoch": 0.71, + "grad_norm": 1.1519502646849555, + "learning_rate": 4.179415014007227e-06, + "loss": 0.5176, + "step": 9111 + }, + { + "epoch": 0.71, + "grad_norm": 1.3001728181410255, + "learning_rate": 4.177372027033724e-06, + "loss": 0.5684, + "step": 9112 + }, + { + "epoch": 0.71, + "grad_norm": 1.3123773335854758, + "learning_rate": 4.175329407672763e-06, + "loss": 0.5427, + "step": 9113 + }, + { + "epoch": 0.71, + "grad_norm": 1.2562943768610535, + "learning_rate": 4.1732871560533105e-06, + "loss": 0.5222, + "step": 9114 + }, + { + "epoch": 0.71, + "grad_norm": 1.1608950128392148, + "learning_rate": 4.1712452723043e-06, + "loss": 0.5034, + "step": 9115 + }, + { + "epoch": 0.71, + "grad_norm": 1.1486048673782692, + "learning_rate": 4.169203756554646e-06, + "loss": 0.5158, + "step": 9116 + }, + { + "epoch": 0.71, + "grad_norm": 1.2299901122184236, + "learning_rate": 4.167162608933243e-06, + "loss": 0.5481, + "step": 9117 + }, + { + "epoch": 0.71, + "grad_norm": 1.2585822172956644, + "learning_rate": 4.1651218295689576e-06, + "loss": 0.5697, + "step": 9118 + }, + { + "epoch": 0.71, + "grad_norm": 1.0749486489562008, + "learning_rate": 4.163081418590639e-06, + "loss": 0.4492, + "step": 9119 + }, + { + "epoch": 0.71, + "grad_norm": 1.214341185408686, + "learning_rate": 4.1610413761271005e-06, + "loss": 0.4727, + "step": 9120 + }, + { + "epoch": 0.71, + "grad_norm": 1.228963118472335, + "learning_rate": 4.159001702307146e-06, + "loss": 0.5474, + "step": 9121 + }, + { + "epoch": 0.71, + "grad_norm": 1.1330237356091868, + "learning_rate": 4.156962397259553e-06, + "loss": 0.5053, + "step": 9122 + }, + { + "epoch": 0.71, + "grad_norm": 1.2549235652733375, + "learning_rate": 4.154923461113066e-06, + "loss": 0.5257, + "step": 9123 + }, + { + "epoch": 0.71, + "grad_norm": 1.2300608125245454, + "learning_rate": 4.152884893996421e-06, + "loss": 0.5583, + "step": 9124 + }, + { + "epoch": 0.71, + "grad_norm": 1.4199510909777995, + "learning_rate": 4.1508466960383165e-06, + "loss": 0.5789, + "step": 9125 + }, + { + "epoch": 0.71, + "grad_norm": 1.2658145432943335, + "learning_rate": 4.148808867367438e-06, + "loss": 0.5786, + "step": 9126 + }, + { + "epoch": 0.71, + "grad_norm": 1.32282018935476, + "learning_rate": 4.146771408112443e-06, + "loss": 0.5474, + "step": 9127 + }, + { + "epoch": 0.71, + "grad_norm": 1.1603524912284828, + "learning_rate": 4.144734318401969e-06, + "loss": 0.4861, + "step": 9128 + }, + { + "epoch": 0.71, + "grad_norm": 1.3406056276002114, + "learning_rate": 4.14269759836463e-06, + "loss": 0.5396, + "step": 9129 + }, + { + "epoch": 0.71, + "grad_norm": 1.1821412090179717, + "learning_rate": 4.1406612481290066e-06, + "loss": 0.5314, + "step": 9130 + }, + { + "epoch": 0.71, + "grad_norm": 1.1814413107502952, + "learning_rate": 4.138625267823669e-06, + "loss": 0.527, + "step": 9131 + }, + { + "epoch": 0.71, + "grad_norm": 1.1621023482422803, + "learning_rate": 4.136589657577164e-06, + "loss": 0.5375, + "step": 9132 + }, + { + "epoch": 0.71, + "grad_norm": 1.0597174054546443, + "learning_rate": 4.134554417518001e-06, + "loss": 0.4648, + "step": 9133 + }, + { + "epoch": 0.71, + "grad_norm": 1.2100368965450459, + "learning_rate": 4.132519547774678e-06, + "loss": 0.4732, + "step": 9134 + }, + { + "epoch": 0.71, + "grad_norm": 1.1344728601033054, + "learning_rate": 4.130485048475673e-06, + "loss": 0.4928, + "step": 9135 + }, + { + "epoch": 0.71, + "grad_norm": 1.1242668624030703, + "learning_rate": 4.128450919749426e-06, + "loss": 0.5065, + "step": 9136 + }, + { + "epoch": 0.71, + "grad_norm": 1.183035841785023, + "learning_rate": 4.1264171617243655e-06, + "loss": 0.4838, + "step": 9137 + }, + { + "epoch": 0.71, + "grad_norm": 1.303411122112108, + "learning_rate": 4.124383774528893e-06, + "loss": 0.5825, + "step": 9138 + }, + { + "epoch": 0.71, + "grad_norm": 1.0176304311360211, + "learning_rate": 4.122350758291387e-06, + "loss": 0.449, + "step": 9139 + }, + { + "epoch": 0.71, + "grad_norm": 1.0302434112367764, + "learning_rate": 4.120318113140207e-06, + "loss": 0.4652, + "step": 9140 + }, + { + "epoch": 0.71, + "grad_norm": 1.2378909581991357, + "learning_rate": 4.118285839203675e-06, + "loss": 0.5182, + "step": 9141 + }, + { + "epoch": 0.71, + "grad_norm": 0.9856735021138948, + "learning_rate": 4.116253936610107e-06, + "loss": 0.453, + "step": 9142 + }, + { + "epoch": 0.71, + "grad_norm": 1.2048638893752568, + "learning_rate": 4.114222405487781e-06, + "loss": 0.4992, + "step": 9143 + }, + { + "epoch": 0.71, + "grad_norm": 1.297629217501277, + "learning_rate": 4.112191245964962e-06, + "loss": 0.5592, + "step": 9144 + }, + { + "epoch": 0.71, + "grad_norm": 1.156058527583534, + "learning_rate": 4.110160458169888e-06, + "loss": 0.4844, + "step": 9145 + }, + { + "epoch": 0.71, + "grad_norm": 1.3217858214541571, + "learning_rate": 4.10813004223077e-06, + "loss": 0.5343, + "step": 9146 + }, + { + "epoch": 0.71, + "grad_norm": 1.253360379913028, + "learning_rate": 4.106099998275801e-06, + "loss": 0.5238, + "step": 9147 + }, + { + "epoch": 0.71, + "grad_norm": 1.2262744929893845, + "learning_rate": 4.104070326433146e-06, + "loss": 0.5616, + "step": 9148 + }, + { + "epoch": 0.71, + "grad_norm": 1.2276122345279603, + "learning_rate": 4.102041026830952e-06, + "loss": 0.5136, + "step": 9149 + }, + { + "epoch": 0.71, + "grad_norm": 1.212738027509996, + "learning_rate": 4.100012099597339e-06, + "loss": 0.5255, + "step": 9150 + }, + { + "epoch": 0.71, + "grad_norm": 1.1425299740288044, + "learning_rate": 4.0979835448604e-06, + "loss": 0.5023, + "step": 9151 + }, + { + "epoch": 0.71, + "grad_norm": 1.3176413608263036, + "learning_rate": 4.095955362748214e-06, + "loss": 0.5592, + "step": 9152 + }, + { + "epoch": 0.71, + "grad_norm": 1.127117178097935, + "learning_rate": 4.093927553388822e-06, + "loss": 0.5445, + "step": 9153 + }, + { + "epoch": 0.71, + "grad_norm": 1.1204352488501264, + "learning_rate": 4.091900116910256e-06, + "loss": 0.4846, + "step": 9154 + }, + { + "epoch": 0.71, + "grad_norm": 1.1651226055315875, + "learning_rate": 4.089873053440521e-06, + "loss": 0.4943, + "step": 9155 + }, + { + "epoch": 0.71, + "grad_norm": 1.2361898480506883, + "learning_rate": 4.087846363107588e-06, + "loss": 0.5198, + "step": 9156 + }, + { + "epoch": 0.71, + "grad_norm": 1.1158977613207686, + "learning_rate": 4.085820046039417e-06, + "loss": 0.5175, + "step": 9157 + }, + { + "epoch": 0.71, + "grad_norm": 1.2050143677895648, + "learning_rate": 4.083794102363939e-06, + "loss": 0.5262, + "step": 9158 + }, + { + "epoch": 0.71, + "grad_norm": 1.1567126714496982, + "learning_rate": 4.081768532209064e-06, + "loss": 0.5194, + "step": 9159 + }, + { + "epoch": 0.71, + "grad_norm": 1.2458617852490823, + "learning_rate": 4.079743335702679e-06, + "loss": 0.5365, + "step": 9160 + }, + { + "epoch": 0.71, + "grad_norm": 1.2723189514059556, + "learning_rate": 4.077718512972638e-06, + "loss": 0.5193, + "step": 9161 + }, + { + "epoch": 0.71, + "grad_norm": 1.1843739572485206, + "learning_rate": 4.075694064146786e-06, + "loss": 0.5017, + "step": 9162 + }, + { + "epoch": 0.71, + "grad_norm": 1.090513126352009, + "learning_rate": 4.07366998935293e-06, + "loss": 0.5133, + "step": 9163 + }, + { + "epoch": 0.71, + "grad_norm": 1.1978434277688532, + "learning_rate": 4.071646288718863e-06, + "loss": 0.5584, + "step": 9164 + }, + { + "epoch": 0.71, + "grad_norm": 1.280004733568618, + "learning_rate": 4.069622962372355e-06, + "loss": 0.5757, + "step": 9165 + }, + { + "epoch": 0.71, + "grad_norm": 1.147629849602213, + "learning_rate": 4.067600010441143e-06, + "loss": 0.5448, + "step": 9166 + }, + { + "epoch": 0.71, + "grad_norm": 1.1799589438488978, + "learning_rate": 4.06557743305295e-06, + "loss": 0.5137, + "step": 9167 + }, + { + "epoch": 0.71, + "grad_norm": 1.066421508679898, + "learning_rate": 4.06355523033547e-06, + "loss": 0.4865, + "step": 9168 + }, + { + "epoch": 0.71, + "grad_norm": 1.2605736322107632, + "learning_rate": 4.0615334024163775e-06, + "loss": 0.5355, + "step": 9169 + }, + { + "epoch": 0.71, + "grad_norm": 1.3299224359080555, + "learning_rate": 4.059511949423322e-06, + "loss": 0.5626, + "step": 9170 + }, + { + "epoch": 0.71, + "grad_norm": 1.1347075823385895, + "learning_rate": 4.0574908714839245e-06, + "loss": 0.4856, + "step": 9171 + }, + { + "epoch": 0.71, + "grad_norm": 1.2916612727555072, + "learning_rate": 4.05547016872579e-06, + "loss": 0.5258, + "step": 9172 + }, + { + "epoch": 0.71, + "grad_norm": 1.2433949486104487, + "learning_rate": 4.05344984127649e-06, + "loss": 0.5682, + "step": 9173 + }, + { + "epoch": 0.71, + "grad_norm": 1.1087062593108779, + "learning_rate": 4.051429889263582e-06, + "loss": 0.5214, + "step": 9174 + }, + { + "epoch": 0.71, + "grad_norm": 1.1999989807601414, + "learning_rate": 4.049410312814598e-06, + "loss": 0.5506, + "step": 9175 + }, + { + "epoch": 0.71, + "grad_norm": 1.1401144478887495, + "learning_rate": 4.0473911120570396e-06, + "loss": 0.5099, + "step": 9176 + }, + { + "epoch": 0.71, + "grad_norm": 1.1448725689436217, + "learning_rate": 4.045372287118391e-06, + "loss": 0.4827, + "step": 9177 + }, + { + "epoch": 0.71, + "grad_norm": 1.2162708106617781, + "learning_rate": 4.043353838126113e-06, + "loss": 0.533, + "step": 9178 + }, + { + "epoch": 0.71, + "grad_norm": 1.1807670453691057, + "learning_rate": 4.041335765207638e-06, + "loss": 0.4799, + "step": 9179 + }, + { + "epoch": 0.71, + "grad_norm": 1.1492767381592752, + "learning_rate": 4.039318068490383e-06, + "loss": 0.5103, + "step": 9180 + }, + { + "epoch": 0.71, + "grad_norm": 1.1679734960351158, + "learning_rate": 4.037300748101728e-06, + "loss": 0.5272, + "step": 9181 + }, + { + "epoch": 0.71, + "grad_norm": 1.1251872754517631, + "learning_rate": 4.0352838041690435e-06, + "loss": 0.52, + "step": 9182 + }, + { + "epoch": 0.71, + "grad_norm": 1.0471188844736952, + "learning_rate": 4.033267236819664e-06, + "loss": 0.4527, + "step": 9183 + }, + { + "epoch": 0.71, + "grad_norm": 1.2601790348648976, + "learning_rate": 4.031251046180906e-06, + "loss": 0.5334, + "step": 9184 + }, + { + "epoch": 0.71, + "grad_norm": 1.1409762116141424, + "learning_rate": 4.029235232380069e-06, + "loss": 0.4469, + "step": 9185 + }, + { + "epoch": 0.71, + "grad_norm": 1.2055794092143792, + "learning_rate": 4.027219795544413e-06, + "loss": 0.5227, + "step": 9186 + }, + { + "epoch": 0.71, + "grad_norm": 1.1711845398384093, + "learning_rate": 4.025204735801187e-06, + "loss": 0.5013, + "step": 9187 + }, + { + "epoch": 0.71, + "grad_norm": 1.2251611953867116, + "learning_rate": 4.023190053277612e-06, + "loss": 0.5534, + "step": 9188 + }, + { + "epoch": 0.71, + "grad_norm": 1.3760132957304065, + "learning_rate": 4.021175748100885e-06, + "loss": 0.546, + "step": 9189 + }, + { + "epoch": 0.71, + "grad_norm": 1.2826929108683494, + "learning_rate": 4.019161820398183e-06, + "loss": 0.5515, + "step": 9190 + }, + { + "epoch": 0.71, + "grad_norm": 1.2477387001536355, + "learning_rate": 4.017148270296652e-06, + "loss": 0.5109, + "step": 9191 + }, + { + "epoch": 0.71, + "grad_norm": 1.1340588257751902, + "learning_rate": 4.015135097923416e-06, + "loss": 0.4834, + "step": 9192 + }, + { + "epoch": 0.71, + "grad_norm": 1.1923391723145023, + "learning_rate": 4.013122303405579e-06, + "loss": 0.5053, + "step": 9193 + }, + { + "epoch": 0.71, + "grad_norm": 1.0859296510261816, + "learning_rate": 4.011109886870218e-06, + "loss": 0.4916, + "step": 9194 + }, + { + "epoch": 0.71, + "grad_norm": 1.188481577703572, + "learning_rate": 4.009097848444389e-06, + "loss": 0.4899, + "step": 9195 + }, + { + "epoch": 0.71, + "grad_norm": 1.2902014122683947, + "learning_rate": 4.007086188255125e-06, + "loss": 0.5312, + "step": 9196 + }, + { + "epoch": 0.71, + "grad_norm": 1.113974104660785, + "learning_rate": 4.005074906429426e-06, + "loss": 0.4828, + "step": 9197 + }, + { + "epoch": 0.71, + "grad_norm": 1.149717331437313, + "learning_rate": 4.003064003094278e-06, + "loss": 0.5427, + "step": 9198 + }, + { + "epoch": 0.71, + "grad_norm": 1.2601682980476445, + "learning_rate": 4.00105347837664e-06, + "loss": 0.4983, + "step": 9199 + }, + { + "epoch": 0.71, + "grad_norm": 1.2905802721304305, + "learning_rate": 3.99904333240345e-06, + "loss": 0.5543, + "step": 9200 + }, + { + "epoch": 0.71, + "grad_norm": 1.235720757964405, + "learning_rate": 3.9970335653016146e-06, + "loss": 0.5093, + "step": 9201 + }, + { + "epoch": 0.71, + "grad_norm": 1.1723006937589986, + "learning_rate": 3.995024177198018e-06, + "loss": 0.5109, + "step": 9202 + }, + { + "epoch": 0.71, + "grad_norm": 1.2080630241241732, + "learning_rate": 3.993015168219527e-06, + "loss": 0.5561, + "step": 9203 + }, + { + "epoch": 0.71, + "grad_norm": 1.1754927677336082, + "learning_rate": 3.991006538492981e-06, + "loss": 0.5244, + "step": 9204 + }, + { + "epoch": 0.71, + "grad_norm": 1.0976940433607125, + "learning_rate": 3.9889982881451924e-06, + "loss": 0.4569, + "step": 9205 + }, + { + "epoch": 0.71, + "grad_norm": 1.2162140603160938, + "learning_rate": 3.98699041730296e-06, + "loss": 0.5393, + "step": 9206 + }, + { + "epoch": 0.71, + "grad_norm": 1.292645094085734, + "learning_rate": 3.9849829260930416e-06, + "loss": 0.5385, + "step": 9207 + }, + { + "epoch": 0.71, + "grad_norm": 1.175047382454505, + "learning_rate": 3.982975814642185e-06, + "loss": 0.4959, + "step": 9208 + }, + { + "epoch": 0.71, + "grad_norm": 1.1190184414991386, + "learning_rate": 3.980969083077108e-06, + "loss": 0.4977, + "step": 9209 + }, + { + "epoch": 0.71, + "grad_norm": 1.242665756574865, + "learning_rate": 3.9789627315245115e-06, + "loss": 0.5725, + "step": 9210 + }, + { + "epoch": 0.71, + "grad_norm": 1.2050901440186441, + "learning_rate": 3.976956760111061e-06, + "loss": 0.4904, + "step": 9211 + }, + { + "epoch": 0.71, + "grad_norm": 1.2056426916015859, + "learning_rate": 3.9749511689634025e-06, + "loss": 0.4892, + "step": 9212 + }, + { + "epoch": 0.71, + "grad_norm": 1.13164091363705, + "learning_rate": 3.972945958208162e-06, + "loss": 0.5123, + "step": 9213 + }, + { + "epoch": 0.71, + "grad_norm": 1.222067840656587, + "learning_rate": 3.9709411279719375e-06, + "loss": 0.5824, + "step": 9214 + }, + { + "epoch": 0.71, + "grad_norm": 1.193208575671176, + "learning_rate": 3.968936678381307e-06, + "loss": 0.4893, + "step": 9215 + }, + { + "epoch": 0.71, + "grad_norm": 1.1840454347598417, + "learning_rate": 3.966932609562822e-06, + "loss": 0.5458, + "step": 9216 + }, + { + "epoch": 0.72, + "grad_norm": 1.1467082700186808, + "learning_rate": 3.964928921643006e-06, + "loss": 0.4631, + "step": 9217 + }, + { + "epoch": 0.72, + "grad_norm": 1.2393138442963556, + "learning_rate": 3.962925614748363e-06, + "loss": 0.5464, + "step": 9218 + }, + { + "epoch": 0.72, + "grad_norm": 1.3700323098960099, + "learning_rate": 3.960922689005373e-06, + "loss": 0.4768, + "step": 9219 + }, + { + "epoch": 0.72, + "grad_norm": 1.1401455537833538, + "learning_rate": 3.958920144540496e-06, + "loss": 0.4996, + "step": 9220 + }, + { + "epoch": 0.72, + "grad_norm": 1.1672620900129118, + "learning_rate": 3.956917981480156e-06, + "loss": 0.5197, + "step": 9221 + }, + { + "epoch": 0.72, + "grad_norm": 1.3190673735491731, + "learning_rate": 3.95491619995076e-06, + "loss": 0.5718, + "step": 9222 + }, + { + "epoch": 0.72, + "grad_norm": 1.1202619171428119, + "learning_rate": 3.952914800078693e-06, + "loss": 0.4645, + "step": 9223 + }, + { + "epoch": 0.72, + "grad_norm": 1.0950828605375833, + "learning_rate": 3.950913781990313e-06, + "loss": 0.4546, + "step": 9224 + }, + { + "epoch": 0.72, + "grad_norm": 1.1518040831950365, + "learning_rate": 3.948913145811956e-06, + "loss": 0.461, + "step": 9225 + }, + { + "epoch": 0.72, + "grad_norm": 1.2472318516288499, + "learning_rate": 3.946912891669934e-06, + "loss": 0.5452, + "step": 9226 + }, + { + "epoch": 0.72, + "grad_norm": 1.2215676138110405, + "learning_rate": 3.9449130196905275e-06, + "loss": 0.5369, + "step": 9227 + }, + { + "epoch": 0.72, + "grad_norm": 1.3032043158454456, + "learning_rate": 3.942913530000002e-06, + "loss": 0.555, + "step": 9228 + }, + { + "epoch": 0.72, + "grad_norm": 1.1938240407780623, + "learning_rate": 3.940914422724597e-06, + "loss": 0.5054, + "step": 9229 + }, + { + "epoch": 0.72, + "grad_norm": 1.1565339538794939, + "learning_rate": 3.938915697990528e-06, + "loss": 0.5335, + "step": 9230 + }, + { + "epoch": 0.72, + "grad_norm": 1.1029229387009631, + "learning_rate": 3.936917355923982e-06, + "loss": 0.5061, + "step": 9231 + }, + { + "epoch": 0.72, + "grad_norm": 1.8393031660460972, + "learning_rate": 3.934919396651121e-06, + "loss": 0.4768, + "step": 9232 + }, + { + "epoch": 0.72, + "grad_norm": 1.0909862477478638, + "learning_rate": 3.932921820298091e-06, + "loss": 0.4682, + "step": 9233 + }, + { + "epoch": 0.72, + "grad_norm": 1.1714377541215373, + "learning_rate": 3.930924626991008e-06, + "loss": 0.5307, + "step": 9234 + }, + { + "epoch": 0.72, + "grad_norm": 1.0746758060867339, + "learning_rate": 3.9289278168559665e-06, + "loss": 0.4797, + "step": 9235 + }, + { + "epoch": 0.72, + "grad_norm": 1.224787356935798, + "learning_rate": 3.926931390019036e-06, + "loss": 0.524, + "step": 9236 + }, + { + "epoch": 0.72, + "grad_norm": 1.2695473537890891, + "learning_rate": 3.9249353466062575e-06, + "loss": 0.479, + "step": 9237 + }, + { + "epoch": 0.72, + "grad_norm": 1.0773588581976077, + "learning_rate": 3.922939686743655e-06, + "loss": 0.4738, + "step": 9238 + }, + { + "epoch": 0.72, + "grad_norm": 1.1997496085399793, + "learning_rate": 3.920944410557222e-06, + "loss": 0.489, + "step": 9239 + }, + { + "epoch": 0.72, + "grad_norm": 1.3874839713055724, + "learning_rate": 3.918949518172936e-06, + "loss": 0.5511, + "step": 9240 + }, + { + "epoch": 0.72, + "grad_norm": 1.2583054234186295, + "learning_rate": 3.916955009716741e-06, + "loss": 0.5154, + "step": 9241 + }, + { + "epoch": 0.72, + "grad_norm": 1.2227689063482987, + "learning_rate": 3.914960885314557e-06, + "loss": 0.5448, + "step": 9242 + }, + { + "epoch": 0.72, + "grad_norm": 1.16173659043508, + "learning_rate": 3.912967145092288e-06, + "loss": 0.4664, + "step": 9243 + }, + { + "epoch": 0.72, + "grad_norm": 1.1343483871827158, + "learning_rate": 3.910973789175807e-06, + "loss": 0.4785, + "step": 9244 + }, + { + "epoch": 0.72, + "grad_norm": 1.2999481190819453, + "learning_rate": 3.908980817690966e-06, + "loss": 0.5031, + "step": 9245 + }, + { + "epoch": 0.72, + "grad_norm": 1.166828990043494, + "learning_rate": 3.906988230763592e-06, + "loss": 0.5191, + "step": 9246 + }, + { + "epoch": 0.72, + "grad_norm": 1.2320738964646103, + "learning_rate": 3.90499602851949e-06, + "loss": 0.5247, + "step": 9247 + }, + { + "epoch": 0.72, + "grad_norm": 1.0787309243116863, + "learning_rate": 3.903004211084431e-06, + "loss": 0.4801, + "step": 9248 + }, + { + "epoch": 0.72, + "grad_norm": 1.2106633029721567, + "learning_rate": 3.901012778584172e-06, + "loss": 0.5072, + "step": 9249 + }, + { + "epoch": 0.72, + "grad_norm": 1.07588100483352, + "learning_rate": 3.8990217311444475e-06, + "loss": 0.4942, + "step": 9250 + }, + { + "epoch": 0.72, + "grad_norm": 1.087377677264296, + "learning_rate": 3.897031068890954e-06, + "loss": 0.4642, + "step": 9251 + }, + { + "epoch": 0.72, + "grad_norm": 1.2158418829662845, + "learning_rate": 3.89504079194938e-06, + "loss": 0.5453, + "step": 9252 + }, + { + "epoch": 0.72, + "grad_norm": 1.2622034423304875, + "learning_rate": 3.893050900445375e-06, + "loss": 0.5472, + "step": 9253 + }, + { + "epoch": 0.72, + "grad_norm": 1.170307178765335, + "learning_rate": 3.891061394504575e-06, + "loss": 0.4958, + "step": 9254 + }, + { + "epoch": 0.72, + "grad_norm": 1.2740703952480115, + "learning_rate": 3.889072274252586e-06, + "loss": 0.5372, + "step": 9255 + }, + { + "epoch": 0.72, + "grad_norm": 1.0767860390907344, + "learning_rate": 3.887083539814993e-06, + "loss": 0.462, + "step": 9256 + }, + { + "epoch": 0.72, + "grad_norm": 1.1228507492638438, + "learning_rate": 3.885095191317357e-06, + "loss": 0.5027, + "step": 9257 + }, + { + "epoch": 0.72, + "grad_norm": 1.275574558425168, + "learning_rate": 3.883107228885209e-06, + "loss": 0.5603, + "step": 9258 + }, + { + "epoch": 0.72, + "grad_norm": 1.2504615885109909, + "learning_rate": 3.881119652644059e-06, + "loss": 0.5159, + "step": 9259 + }, + { + "epoch": 0.72, + "grad_norm": 1.2307179032223476, + "learning_rate": 3.8791324627193996e-06, + "loss": 0.5248, + "step": 9260 + }, + { + "epoch": 0.72, + "grad_norm": 1.2152022050641655, + "learning_rate": 3.877145659236682e-06, + "loss": 0.5123, + "step": 9261 + }, + { + "epoch": 0.72, + "grad_norm": 1.0877914520480494, + "learning_rate": 3.875159242321353e-06, + "loss": 0.4731, + "step": 9262 + }, + { + "epoch": 0.72, + "grad_norm": 1.258201728448149, + "learning_rate": 3.873173212098818e-06, + "loss": 0.5634, + "step": 9263 + }, + { + "epoch": 0.72, + "grad_norm": 1.1698818295460138, + "learning_rate": 3.871187568694468e-06, + "loss": 0.5068, + "step": 9264 + }, + { + "epoch": 0.72, + "grad_norm": 1.157321330507798, + "learning_rate": 3.869202312233668e-06, + "loss": 0.4622, + "step": 9265 + }, + { + "epoch": 0.72, + "grad_norm": 1.1319227210042582, + "learning_rate": 3.8672174428417555e-06, + "loss": 0.4614, + "step": 9266 + }, + { + "epoch": 0.72, + "grad_norm": 1.148809100385579, + "learning_rate": 3.865232960644051e-06, + "loss": 0.4694, + "step": 9267 + }, + { + "epoch": 0.72, + "grad_norm": 1.1638486524944238, + "learning_rate": 3.8632488657658375e-06, + "loss": 0.5079, + "step": 9268 + }, + { + "epoch": 0.72, + "grad_norm": 1.2258697789002149, + "learning_rate": 3.861265158332383e-06, + "loss": 0.508, + "step": 9269 + }, + { + "epoch": 0.72, + "grad_norm": 1.2231359546231242, + "learning_rate": 3.859281838468937e-06, + "loss": 0.494, + "step": 9270 + }, + { + "epoch": 0.72, + "grad_norm": 1.0851564411468009, + "learning_rate": 3.857298906300705e-06, + "loss": 0.5212, + "step": 9271 + }, + { + "epoch": 0.72, + "grad_norm": 1.1181771756531849, + "learning_rate": 3.85531636195289e-06, + "loss": 0.4744, + "step": 9272 + }, + { + "epoch": 0.72, + "grad_norm": 1.174684995672424, + "learning_rate": 3.8533342055506505e-06, + "loss": 0.5124, + "step": 9273 + }, + { + "epoch": 0.72, + "grad_norm": 1.2072320172393856, + "learning_rate": 3.851352437219137e-06, + "loss": 0.5216, + "step": 9274 + }, + { + "epoch": 0.72, + "grad_norm": 1.2233539577134949, + "learning_rate": 3.849371057083465e-06, + "loss": 0.4894, + "step": 9275 + }, + { + "epoch": 0.72, + "grad_norm": 1.0967186558255064, + "learning_rate": 3.8473900652687336e-06, + "loss": 0.4987, + "step": 9276 + }, + { + "epoch": 0.72, + "grad_norm": 1.1696979900748865, + "learning_rate": 3.845409461900012e-06, + "loss": 0.523, + "step": 9277 + }, + { + "epoch": 0.72, + "grad_norm": 1.2535209658025441, + "learning_rate": 3.843429247102343e-06, + "loss": 0.5261, + "step": 9278 + }, + { + "epoch": 0.72, + "grad_norm": 1.2239009383792616, + "learning_rate": 3.841449421000748e-06, + "loss": 0.5455, + "step": 9279 + }, + { + "epoch": 0.72, + "grad_norm": 1.2086710731908556, + "learning_rate": 3.839469983720229e-06, + "loss": 0.5539, + "step": 9280 + }, + { + "epoch": 0.72, + "grad_norm": 1.2962919556201942, + "learning_rate": 3.837490935385751e-06, + "loss": 0.521, + "step": 9281 + }, + { + "epoch": 0.72, + "grad_norm": 1.156273609642032, + "learning_rate": 3.835512276122267e-06, + "loss": 0.4906, + "step": 9282 + }, + { + "epoch": 0.72, + "grad_norm": 1.156572348535469, + "learning_rate": 3.833534006054694e-06, + "loss": 0.5028, + "step": 9283 + }, + { + "epoch": 0.72, + "grad_norm": 1.282507442207382, + "learning_rate": 3.8315561253079344e-06, + "loss": 0.5582, + "step": 9284 + }, + { + "epoch": 0.72, + "grad_norm": 1.1559097717375064, + "learning_rate": 3.829578634006862e-06, + "loss": 0.4893, + "step": 9285 + }, + { + "epoch": 0.72, + "grad_norm": 1.3159027042105154, + "learning_rate": 3.827601532276325e-06, + "loss": 0.5633, + "step": 9286 + }, + { + "epoch": 0.72, + "grad_norm": 1.0551041521847013, + "learning_rate": 3.825624820241153e-06, + "loss": 0.471, + "step": 9287 + }, + { + "epoch": 0.72, + "grad_norm": 1.2604334281038085, + "learning_rate": 3.823648498026138e-06, + "loss": 0.5181, + "step": 9288 + }, + { + "epoch": 0.72, + "grad_norm": 1.0864020801487757, + "learning_rate": 3.821672565756058e-06, + "loss": 0.4641, + "step": 9289 + }, + { + "epoch": 0.72, + "grad_norm": 1.2304104745779894, + "learning_rate": 3.81969702355567e-06, + "loss": 0.5294, + "step": 9290 + }, + { + "epoch": 0.72, + "grad_norm": 1.1659333728808234, + "learning_rate": 3.8177218715496915e-06, + "loss": 0.5302, + "step": 9291 + }, + { + "epoch": 0.72, + "grad_norm": 1.2711768181215732, + "learning_rate": 3.8157471098628295e-06, + "loss": 0.5153, + "step": 9292 + }, + { + "epoch": 0.72, + "grad_norm": 1.210046895986327, + "learning_rate": 3.8137727386197564e-06, + "loss": 0.4897, + "step": 9293 + }, + { + "epoch": 0.72, + "grad_norm": 1.110144496052831, + "learning_rate": 3.8117987579451275e-06, + "loss": 0.5018, + "step": 9294 + }, + { + "epoch": 0.72, + "grad_norm": 1.1120689885879511, + "learning_rate": 3.809825167963569e-06, + "loss": 0.4739, + "step": 9295 + }, + { + "epoch": 0.72, + "grad_norm": 1.2287830249776177, + "learning_rate": 3.807851968799685e-06, + "loss": 0.5394, + "step": 9296 + }, + { + "epoch": 0.72, + "grad_norm": 1.255823019162969, + "learning_rate": 3.8058791605780577e-06, + "loss": 0.5251, + "step": 9297 + }, + { + "epoch": 0.72, + "grad_norm": 1.2025825962242997, + "learning_rate": 3.8039067434232324e-06, + "loss": 0.5203, + "step": 9298 + }, + { + "epoch": 0.72, + "grad_norm": 1.182422625084933, + "learning_rate": 3.8019347174597454e-06, + "loss": 0.5131, + "step": 9299 + }, + { + "epoch": 0.72, + "grad_norm": 1.1508865194348052, + "learning_rate": 3.7999630828120947e-06, + "loss": 0.4717, + "step": 9300 + }, + { + "epoch": 0.72, + "grad_norm": 1.2077238694374473, + "learning_rate": 3.797991839604762e-06, + "loss": 0.4728, + "step": 9301 + }, + { + "epoch": 0.72, + "grad_norm": 1.1981971191287568, + "learning_rate": 3.7960209879622025e-06, + "loss": 0.5582, + "step": 9302 + }, + { + "epoch": 0.72, + "grad_norm": 1.2085117285131313, + "learning_rate": 3.79405052800885e-06, + "loss": 0.5187, + "step": 9303 + }, + { + "epoch": 0.72, + "grad_norm": 1.2688084819074847, + "learning_rate": 3.792080459869103e-06, + "loss": 0.5462, + "step": 9304 + }, + { + "epoch": 0.72, + "grad_norm": 1.157197615506893, + "learning_rate": 3.7901107836673444e-06, + "loss": 0.4585, + "step": 9305 + }, + { + "epoch": 0.72, + "grad_norm": 1.3535944047380466, + "learning_rate": 3.788141499527932e-06, + "loss": 0.5848, + "step": 9306 + }, + { + "epoch": 0.72, + "grad_norm": 1.2523252793994752, + "learning_rate": 3.7861726075751948e-06, + "loss": 0.5391, + "step": 9307 + }, + { + "epoch": 0.72, + "grad_norm": 1.130220644601081, + "learning_rate": 3.7842041079334446e-06, + "loss": 0.5103, + "step": 9308 + }, + { + "epoch": 0.72, + "grad_norm": 1.1395874007092592, + "learning_rate": 3.7822360007269564e-06, + "loss": 0.4914, + "step": 9309 + }, + { + "epoch": 0.72, + "grad_norm": 1.225183720338488, + "learning_rate": 3.780268286079988e-06, + "loss": 0.5236, + "step": 9310 + }, + { + "epoch": 0.72, + "grad_norm": 1.2772257339134647, + "learning_rate": 3.77830096411677e-06, + "loss": 0.5128, + "step": 9311 + }, + { + "epoch": 0.72, + "grad_norm": 1.1542111540849496, + "learning_rate": 3.776334034961513e-06, + "loss": 0.4983, + "step": 9312 + }, + { + "epoch": 0.72, + "grad_norm": 1.269094350904054, + "learning_rate": 3.7743674987384017e-06, + "loss": 0.5637, + "step": 9313 + }, + { + "epoch": 0.72, + "grad_norm": 1.1873309366443683, + "learning_rate": 3.7724013555715867e-06, + "loss": 0.4959, + "step": 9314 + }, + { + "epoch": 0.72, + "grad_norm": 1.1918699769112768, + "learning_rate": 3.7704356055852043e-06, + "loss": 0.4748, + "step": 9315 + }, + { + "epoch": 0.72, + "grad_norm": 1.225826018022242, + "learning_rate": 3.768470248903362e-06, + "loss": 0.549, + "step": 9316 + }, + { + "epoch": 0.72, + "grad_norm": 1.2046000863309725, + "learning_rate": 3.766505285650144e-06, + "loss": 0.5535, + "step": 9317 + }, + { + "epoch": 0.72, + "grad_norm": 1.3875861235687084, + "learning_rate": 3.7645407159496104e-06, + "loss": 0.4658, + "step": 9318 + }, + { + "epoch": 0.72, + "grad_norm": 1.1329643936925746, + "learning_rate": 3.762576539925793e-06, + "loss": 0.4972, + "step": 9319 + }, + { + "epoch": 0.72, + "grad_norm": 1.1519205641777288, + "learning_rate": 3.7606127577026965e-06, + "loss": 0.4955, + "step": 9320 + }, + { + "epoch": 0.72, + "grad_norm": 1.2569953206809021, + "learning_rate": 3.7586493694043068e-06, + "loss": 0.5415, + "step": 9321 + }, + { + "epoch": 0.72, + "grad_norm": 1.1525568878134829, + "learning_rate": 3.7566863751545833e-06, + "loss": 0.5547, + "step": 9322 + }, + { + "epoch": 0.72, + "grad_norm": 1.26928328219844, + "learning_rate": 3.7547237750774647e-06, + "loss": 0.6111, + "step": 9323 + }, + { + "epoch": 0.72, + "grad_norm": 1.2869445046862316, + "learning_rate": 3.7527615692968513e-06, + "loss": 0.5336, + "step": 9324 + }, + { + "epoch": 0.72, + "grad_norm": 1.2728515868503767, + "learning_rate": 3.7507997579366317e-06, + "loss": 0.5836, + "step": 9325 + }, + { + "epoch": 0.72, + "grad_norm": 1.2223000296959026, + "learning_rate": 3.7488383411206654e-06, + "loss": 0.4893, + "step": 9326 + }, + { + "epoch": 0.72, + "grad_norm": 1.1165511446358587, + "learning_rate": 3.7468773189727857e-06, + "loss": 0.4623, + "step": 9327 + }, + { + "epoch": 0.72, + "grad_norm": 1.2386484173370365, + "learning_rate": 3.744916691616807e-06, + "loss": 0.551, + "step": 9328 + }, + { + "epoch": 0.72, + "grad_norm": 1.1764325844011927, + "learning_rate": 3.742956459176508e-06, + "loss": 0.5148, + "step": 9329 + }, + { + "epoch": 0.72, + "grad_norm": 1.2103421624387882, + "learning_rate": 3.7409966217756477e-06, + "loss": 0.4758, + "step": 9330 + }, + { + "epoch": 0.72, + "grad_norm": 1.1721515583496773, + "learning_rate": 3.739037179537962e-06, + "loss": 0.5186, + "step": 9331 + }, + { + "epoch": 0.72, + "grad_norm": 1.2036209075252815, + "learning_rate": 3.737078132587163e-06, + "loss": 0.4936, + "step": 9332 + }, + { + "epoch": 0.72, + "grad_norm": 1.2664356225929865, + "learning_rate": 3.735119481046936e-06, + "loss": 0.5838, + "step": 9333 + }, + { + "epoch": 0.72, + "grad_norm": 1.1488216043265833, + "learning_rate": 3.7331612250409354e-06, + "loss": 0.5309, + "step": 9334 + }, + { + "epoch": 0.72, + "grad_norm": 1.2473215971686313, + "learning_rate": 3.7312033646928004e-06, + "loss": 0.5032, + "step": 9335 + }, + { + "epoch": 0.72, + "grad_norm": 1.1328500675680737, + "learning_rate": 3.7292459001261383e-06, + "loss": 0.5307, + "step": 9336 + }, + { + "epoch": 0.72, + "grad_norm": 1.1376216488021684, + "learning_rate": 3.7272888314645363e-06, + "loss": 0.4674, + "step": 9337 + }, + { + "epoch": 0.72, + "grad_norm": 1.0640978859102415, + "learning_rate": 3.725332158831556e-06, + "loss": 0.4742, + "step": 9338 + }, + { + "epoch": 0.72, + "grad_norm": 1.1563694350958473, + "learning_rate": 3.7233758823507303e-06, + "loss": 0.5684, + "step": 9339 + }, + { + "epoch": 0.72, + "grad_norm": 1.2073625523468285, + "learning_rate": 3.7214200021455647e-06, + "loss": 0.5513, + "step": 9340 + }, + { + "epoch": 0.72, + "grad_norm": 1.0869069334826291, + "learning_rate": 3.719464518339547e-06, + "loss": 0.4856, + "step": 9341 + }, + { + "epoch": 0.72, + "grad_norm": 1.206105936173928, + "learning_rate": 3.7175094310561375e-06, + "loss": 0.5413, + "step": 9342 + }, + { + "epoch": 0.72, + "grad_norm": 1.1111378699630299, + "learning_rate": 3.7155547404187754e-06, + "loss": 0.498, + "step": 9343 + }, + { + "epoch": 0.72, + "grad_norm": 1.177880273388775, + "learning_rate": 3.7136004465508624e-06, + "loss": 0.5172, + "step": 9344 + }, + { + "epoch": 0.72, + "grad_norm": 1.2648978322502944, + "learning_rate": 3.711646549575786e-06, + "loss": 0.5064, + "step": 9345 + }, + { + "epoch": 0.73, + "grad_norm": 1.184756623638608, + "learning_rate": 3.709693049616907e-06, + "loss": 0.546, + "step": 9346 + }, + { + "epoch": 0.73, + "grad_norm": 1.1899467659110678, + "learning_rate": 3.7077399467975594e-06, + "loss": 0.5459, + "step": 9347 + }, + { + "epoch": 0.73, + "grad_norm": 1.0921243302791952, + "learning_rate": 3.7057872412410566e-06, + "loss": 0.5071, + "step": 9348 + }, + { + "epoch": 0.73, + "grad_norm": 1.1971435484638682, + "learning_rate": 3.703834933070679e-06, + "loss": 0.5243, + "step": 9349 + }, + { + "epoch": 0.73, + "grad_norm": 1.1763220775707472, + "learning_rate": 3.7018830224096824e-06, + "loss": 0.5178, + "step": 9350 + }, + { + "epoch": 0.73, + "grad_norm": 1.102764582988765, + "learning_rate": 3.6999315093813048e-06, + "loss": 0.475, + "step": 9351 + }, + { + "epoch": 0.73, + "grad_norm": 1.2368553929560284, + "learning_rate": 3.6979803941087546e-06, + "loss": 0.5676, + "step": 9352 + }, + { + "epoch": 0.73, + "grad_norm": 1.1846536353887762, + "learning_rate": 3.696029676715216e-06, + "loss": 0.5264, + "step": 9353 + }, + { + "epoch": 0.73, + "grad_norm": 1.159076072220522, + "learning_rate": 3.694079357323853e-06, + "loss": 0.5438, + "step": 9354 + }, + { + "epoch": 0.73, + "grad_norm": 1.1696867794164079, + "learning_rate": 3.69212943605779e-06, + "loss": 0.5435, + "step": 9355 + }, + { + "epoch": 0.73, + "grad_norm": 1.3939657052937475, + "learning_rate": 3.6901799130401393e-06, + "loss": 0.5959, + "step": 9356 + }, + { + "epoch": 0.73, + "grad_norm": 1.1136429048675198, + "learning_rate": 3.688230788393986e-06, + "loss": 0.5107, + "step": 9357 + }, + { + "epoch": 0.73, + "grad_norm": 1.1645852265771934, + "learning_rate": 3.6862820622423913e-06, + "loss": 0.4796, + "step": 9358 + }, + { + "epoch": 0.73, + "grad_norm": 1.2764804267413663, + "learning_rate": 3.684333734708384e-06, + "loss": 0.5308, + "step": 9359 + }, + { + "epoch": 0.73, + "grad_norm": 1.157024845618142, + "learning_rate": 3.68238580591497e-06, + "loss": 0.4817, + "step": 9360 + }, + { + "epoch": 0.73, + "grad_norm": 1.194352506209174, + "learning_rate": 3.680438275985133e-06, + "loss": 0.536, + "step": 9361 + }, + { + "epoch": 0.73, + "grad_norm": 1.2729384959961452, + "learning_rate": 3.6784911450418337e-06, + "loss": 0.5242, + "step": 9362 + }, + { + "epoch": 0.73, + "grad_norm": 1.2697319928428525, + "learning_rate": 3.676544413208002e-06, + "loss": 0.5193, + "step": 9363 + }, + { + "epoch": 0.73, + "grad_norm": 1.1208878703598866, + "learning_rate": 3.6745980806065507e-06, + "loss": 0.5217, + "step": 9364 + }, + { + "epoch": 0.73, + "grad_norm": 1.2064323544404632, + "learning_rate": 3.6726521473603525e-06, + "loss": 0.5193, + "step": 9365 + }, + { + "epoch": 0.73, + "grad_norm": 1.1178337308868647, + "learning_rate": 3.670706613592271e-06, + "loss": 0.4714, + "step": 9366 + }, + { + "epoch": 0.73, + "grad_norm": 1.2075579336733664, + "learning_rate": 3.6687614794251348e-06, + "loss": 0.4973, + "step": 9367 + }, + { + "epoch": 0.73, + "grad_norm": 1.2252114989204077, + "learning_rate": 3.6668167449817548e-06, + "loss": 0.5328, + "step": 9368 + }, + { + "epoch": 0.73, + "grad_norm": 1.0726376636781376, + "learning_rate": 3.6648724103849086e-06, + "loss": 0.5084, + "step": 9369 + }, + { + "epoch": 0.73, + "grad_norm": 1.2401961193041842, + "learning_rate": 3.662928475757348e-06, + "loss": 0.54, + "step": 9370 + }, + { + "epoch": 0.73, + "grad_norm": 1.196323989046907, + "learning_rate": 3.6609849412218092e-06, + "loss": 0.5113, + "step": 9371 + }, + { + "epoch": 0.73, + "grad_norm": 1.068363894046801, + "learning_rate": 3.6590418069009947e-06, + "loss": 0.5007, + "step": 9372 + }, + { + "epoch": 0.73, + "grad_norm": 1.1908152130453231, + "learning_rate": 3.657099072917587e-06, + "loss": 0.4856, + "step": 9373 + }, + { + "epoch": 0.73, + "grad_norm": 1.080447418377442, + "learning_rate": 3.6551567393942422e-06, + "loss": 0.5003, + "step": 9374 + }, + { + "epoch": 0.73, + "grad_norm": 1.3184209964620741, + "learning_rate": 3.6532148064535855e-06, + "loss": 0.5416, + "step": 9375 + }, + { + "epoch": 0.73, + "grad_norm": 1.1106599699835906, + "learning_rate": 3.6512732742182223e-06, + "loss": 0.4943, + "step": 9376 + }, + { + "epoch": 0.73, + "grad_norm": 1.2009596722491962, + "learning_rate": 3.649332142810732e-06, + "loss": 0.5597, + "step": 9377 + }, + { + "epoch": 0.73, + "grad_norm": 1.1328043970278625, + "learning_rate": 3.6473914123536725e-06, + "loss": 0.4907, + "step": 9378 + }, + { + "epoch": 0.73, + "grad_norm": 1.1424279269100082, + "learning_rate": 3.645451082969569e-06, + "loss": 0.4784, + "step": 9379 + }, + { + "epoch": 0.73, + "grad_norm": 1.282054323015519, + "learning_rate": 3.64351115478092e-06, + "loss": 0.5729, + "step": 9380 + }, + { + "epoch": 0.73, + "grad_norm": 1.1911864030959958, + "learning_rate": 3.6415716279102065e-06, + "loss": 0.4772, + "step": 9381 + }, + { + "epoch": 0.73, + "grad_norm": 1.2475869729832347, + "learning_rate": 3.6396325024798817e-06, + "loss": 0.5345, + "step": 9382 + }, + { + "epoch": 0.73, + "grad_norm": 1.1954470789327094, + "learning_rate": 3.6376937786123722e-06, + "loss": 0.4915, + "step": 9383 + }, + { + "epoch": 0.73, + "grad_norm": 1.1322935823192142, + "learning_rate": 3.6357554564300824e-06, + "loss": 0.4622, + "step": 9384 + }, + { + "epoch": 0.73, + "grad_norm": 1.1771800696308359, + "learning_rate": 3.6338175360553827e-06, + "loss": 0.59, + "step": 9385 + }, + { + "epoch": 0.73, + "grad_norm": 1.1472558929996926, + "learning_rate": 3.6318800176106283e-06, + "loss": 0.524, + "step": 9386 + }, + { + "epoch": 0.73, + "grad_norm": 1.2529121808013093, + "learning_rate": 3.629942901218142e-06, + "loss": 0.5223, + "step": 9387 + }, + { + "epoch": 0.73, + "grad_norm": 1.1593039107827388, + "learning_rate": 3.6280061870002303e-06, + "loss": 0.5269, + "step": 9388 + }, + { + "epoch": 0.73, + "grad_norm": 1.1656866328331474, + "learning_rate": 3.6260698750791624e-06, + "loss": 0.4952, + "step": 9389 + }, + { + "epoch": 0.73, + "grad_norm": 1.239932045920407, + "learning_rate": 3.6241339655771844e-06, + "loss": 0.5095, + "step": 9390 + }, + { + "epoch": 0.73, + "grad_norm": 1.131180264763077, + "learning_rate": 3.6221984586165247e-06, + "loss": 0.4894, + "step": 9391 + }, + { + "epoch": 0.73, + "grad_norm": 1.1253704944466592, + "learning_rate": 3.620263354319382e-06, + "loss": 0.5021, + "step": 9392 + }, + { + "epoch": 0.73, + "grad_norm": 1.2463284932080216, + "learning_rate": 3.6183286528079287e-06, + "loss": 0.5387, + "step": 9393 + }, + { + "epoch": 0.73, + "grad_norm": 1.1575249524650533, + "learning_rate": 3.6163943542043156e-06, + "loss": 0.464, + "step": 9394 + }, + { + "epoch": 0.73, + "grad_norm": 1.2383408396729982, + "learning_rate": 3.61446045863066e-06, + "loss": 0.5171, + "step": 9395 + }, + { + "epoch": 0.73, + "grad_norm": 1.238486769314854, + "learning_rate": 3.612526966209059e-06, + "loss": 0.529, + "step": 9396 + }, + { + "epoch": 0.73, + "grad_norm": 1.2254701840879783, + "learning_rate": 3.6105938770615902e-06, + "loss": 0.5012, + "step": 9397 + }, + { + "epoch": 0.73, + "grad_norm": 1.1532174112150761, + "learning_rate": 3.608661191310291e-06, + "loss": 0.4944, + "step": 9398 + }, + { + "epoch": 0.73, + "grad_norm": 1.2398912811302558, + "learning_rate": 3.6067289090771883e-06, + "loss": 0.5186, + "step": 9399 + }, + { + "epoch": 0.73, + "grad_norm": 1.2043771544409003, + "learning_rate": 3.6047970304842727e-06, + "loss": 0.5218, + "step": 9400 + }, + { + "epoch": 0.73, + "grad_norm": 1.207110405459133, + "learning_rate": 3.6028655556535142e-06, + "loss": 0.4893, + "step": 9401 + }, + { + "epoch": 0.73, + "grad_norm": 1.232905756876433, + "learning_rate": 3.600934484706858e-06, + "loss": 0.506, + "step": 9402 + }, + { + "epoch": 0.73, + "grad_norm": 1.203818579569143, + "learning_rate": 3.5990038177662234e-06, + "loss": 0.5199, + "step": 9403 + }, + { + "epoch": 0.73, + "grad_norm": 1.1479331225942453, + "learning_rate": 3.5970735549535065e-06, + "loss": 0.4979, + "step": 9404 + }, + { + "epoch": 0.73, + "grad_norm": 2.278866293278656, + "learning_rate": 3.5951436963905663e-06, + "loss": 0.5025, + "step": 9405 + }, + { + "epoch": 0.73, + "grad_norm": 1.2012205036523376, + "learning_rate": 3.59321424219925e-06, + "loss": 0.5143, + "step": 9406 + }, + { + "epoch": 0.73, + "grad_norm": 1.2108788260734868, + "learning_rate": 3.591285192501376e-06, + "loss": 0.5444, + "step": 9407 + }, + { + "epoch": 0.73, + "grad_norm": 1.1544723764603242, + "learning_rate": 3.58935654741873e-06, + "loss": 0.4352, + "step": 9408 + }, + { + "epoch": 0.73, + "grad_norm": 1.148496509515144, + "learning_rate": 3.5874283070730787e-06, + "loss": 0.4986, + "step": 9409 + }, + { + "epoch": 0.73, + "grad_norm": 1.1626005272827842, + "learning_rate": 3.585500471586166e-06, + "loss": 0.4759, + "step": 9410 + }, + { + "epoch": 0.73, + "grad_norm": 1.2613769635207994, + "learning_rate": 3.583573041079701e-06, + "loss": 0.5354, + "step": 9411 + }, + { + "epoch": 0.73, + "grad_norm": 1.2848791038105478, + "learning_rate": 3.581646015675374e-06, + "loss": 0.5735, + "step": 9412 + }, + { + "epoch": 0.73, + "grad_norm": 1.2028704782746054, + "learning_rate": 3.579719395494847e-06, + "loss": 0.5234, + "step": 9413 + }, + { + "epoch": 0.73, + "grad_norm": 1.266301056942487, + "learning_rate": 3.577793180659761e-06, + "loss": 0.5607, + "step": 9414 + }, + { + "epoch": 0.73, + "grad_norm": 1.194764504072646, + "learning_rate": 3.575867371291728e-06, + "loss": 0.5197, + "step": 9415 + }, + { + "epoch": 0.73, + "grad_norm": 1.2372307879532023, + "learning_rate": 3.5739419675123275e-06, + "loss": 0.5199, + "step": 9416 + }, + { + "epoch": 0.73, + "grad_norm": 1.2479295750089943, + "learning_rate": 3.5720169694431294e-06, + "loss": 0.5336, + "step": 9417 + }, + { + "epoch": 0.73, + "grad_norm": 1.1309637835173993, + "learning_rate": 3.5700923772056606e-06, + "loss": 0.5064, + "step": 9418 + }, + { + "epoch": 0.73, + "grad_norm": 1.1952385973056323, + "learning_rate": 3.5681681909214338e-06, + "loss": 0.5145, + "step": 9419 + }, + { + "epoch": 0.73, + "grad_norm": 1.1421187039594531, + "learning_rate": 3.5662444107119365e-06, + "loss": 0.5244, + "step": 9420 + }, + { + "epoch": 0.73, + "grad_norm": 1.2065019650749584, + "learning_rate": 3.5643210366986205e-06, + "loss": 0.4596, + "step": 9421 + }, + { + "epoch": 0.73, + "grad_norm": 1.217841616804464, + "learning_rate": 3.5623980690029202e-06, + "loss": 0.5398, + "step": 9422 + }, + { + "epoch": 0.73, + "grad_norm": 1.185905238983062, + "learning_rate": 3.560475507746244e-06, + "loss": 0.4767, + "step": 9423 + }, + { + "epoch": 0.73, + "grad_norm": 1.2575858723497246, + "learning_rate": 3.5585533530499726e-06, + "loss": 0.585, + "step": 9424 + }, + { + "epoch": 0.73, + "grad_norm": 1.1313473928267659, + "learning_rate": 3.556631605035464e-06, + "loss": 0.4843, + "step": 9425 + }, + { + "epoch": 0.73, + "grad_norm": 1.1414290429149618, + "learning_rate": 3.554710263824043e-06, + "loss": 0.5394, + "step": 9426 + }, + { + "epoch": 0.73, + "grad_norm": 1.2171740490778613, + "learning_rate": 3.5527893295370196e-06, + "loss": 0.5175, + "step": 9427 + }, + { + "epoch": 0.73, + "grad_norm": 1.1237341328689883, + "learning_rate": 3.550868802295666e-06, + "loss": 0.4835, + "step": 9428 + }, + { + "epoch": 0.73, + "grad_norm": 1.2128348468234456, + "learning_rate": 3.548948682221238e-06, + "loss": 0.5069, + "step": 9429 + }, + { + "epoch": 0.73, + "grad_norm": 1.1712848957770694, + "learning_rate": 3.547028969434966e-06, + "loss": 0.5175, + "step": 9430 + }, + { + "epoch": 0.73, + "grad_norm": 1.1700183147235923, + "learning_rate": 3.545109664058044e-06, + "loss": 0.4947, + "step": 9431 + }, + { + "epoch": 0.73, + "grad_norm": 1.1560020696463091, + "learning_rate": 3.5431907662116528e-06, + "loss": 0.4864, + "step": 9432 + }, + { + "epoch": 0.73, + "grad_norm": 1.1559592732082835, + "learning_rate": 3.5412722760169403e-06, + "loss": 0.4897, + "step": 9433 + }, + { + "epoch": 0.73, + "grad_norm": 1.2422307804652986, + "learning_rate": 3.5393541935950327e-06, + "loss": 0.4787, + "step": 9434 + }, + { + "epoch": 0.73, + "grad_norm": 1.1150600744751, + "learning_rate": 3.53743651906703e-06, + "loss": 0.4936, + "step": 9435 + }, + { + "epoch": 0.73, + "grad_norm": 1.2099919226668443, + "learning_rate": 3.5355192525539996e-06, + "loss": 0.5095, + "step": 9436 + }, + { + "epoch": 0.73, + "grad_norm": 1.1187619938553792, + "learning_rate": 3.5336023941769947e-06, + "loss": 0.4575, + "step": 9437 + }, + { + "epoch": 0.73, + "grad_norm": 1.186138376304257, + "learning_rate": 3.5316859440570284e-06, + "loss": 0.5227, + "step": 9438 + }, + { + "epoch": 0.73, + "grad_norm": 1.2782591359979427, + "learning_rate": 3.5297699023151013e-06, + "loss": 0.5308, + "step": 9439 + }, + { + "epoch": 0.73, + "grad_norm": 1.1651090487413547, + "learning_rate": 3.527854269072186e-06, + "loss": 0.5115, + "step": 9440 + }, + { + "epoch": 0.73, + "grad_norm": 1.3675091174447633, + "learning_rate": 3.525939044449218e-06, + "loss": 0.5431, + "step": 9441 + }, + { + "epoch": 0.73, + "grad_norm": 1.346361284464671, + "learning_rate": 3.524024228567121e-06, + "loss": 0.5604, + "step": 9442 + }, + { + "epoch": 0.73, + "grad_norm": 1.3062517759320174, + "learning_rate": 3.5221098215467852e-06, + "loss": 0.561, + "step": 9443 + }, + { + "epoch": 0.73, + "grad_norm": 1.0930372368035426, + "learning_rate": 3.520195823509078e-06, + "loss": 0.4577, + "step": 9444 + }, + { + "epoch": 0.73, + "grad_norm": 1.2473986259965422, + "learning_rate": 3.518282234574845e-06, + "loss": 0.5526, + "step": 9445 + }, + { + "epoch": 0.73, + "grad_norm": 1.1495726288844041, + "learning_rate": 3.5163690548648898e-06, + "loss": 0.4712, + "step": 9446 + }, + { + "epoch": 0.73, + "grad_norm": 1.2018343455500333, + "learning_rate": 3.514456284500012e-06, + "loss": 0.5224, + "step": 9447 + }, + { + "epoch": 0.73, + "grad_norm": 1.2390626674041059, + "learning_rate": 3.5125439236009674e-06, + "loss": 0.4882, + "step": 9448 + }, + { + "epoch": 0.73, + "grad_norm": 1.2288681034053315, + "learning_rate": 3.510631972288494e-06, + "loss": 0.5326, + "step": 9449 + }, + { + "epoch": 0.73, + "grad_norm": 1.220903059798556, + "learning_rate": 3.508720430683309e-06, + "loss": 0.5283, + "step": 9450 + }, + { + "epoch": 0.73, + "grad_norm": 1.289878263421879, + "learning_rate": 3.5068092989060907e-06, + "loss": 0.5671, + "step": 9451 + }, + { + "epoch": 0.73, + "grad_norm": 1.2561371349287989, + "learning_rate": 3.504898577077502e-06, + "loss": 0.5717, + "step": 9452 + }, + { + "epoch": 0.73, + "grad_norm": 1.1130103065993753, + "learning_rate": 3.502988265318176e-06, + "loss": 0.5147, + "step": 9453 + }, + { + "epoch": 0.73, + "grad_norm": 1.1680463682174476, + "learning_rate": 3.501078363748721e-06, + "loss": 0.5658, + "step": 9454 + }, + { + "epoch": 0.73, + "grad_norm": 1.1959511042076074, + "learning_rate": 3.4991688724897223e-06, + "loss": 0.5251, + "step": 9455 + }, + { + "epoch": 0.73, + "grad_norm": 1.2124657616254628, + "learning_rate": 3.49725979166173e-06, + "loss": 0.5339, + "step": 9456 + }, + { + "epoch": 0.73, + "grad_norm": 1.2430410271807268, + "learning_rate": 3.4953511213852785e-06, + "loss": 0.5359, + "step": 9457 + }, + { + "epoch": 0.73, + "grad_norm": 1.2150701085024065, + "learning_rate": 3.493442861780868e-06, + "loss": 0.5163, + "step": 9458 + }, + { + "epoch": 0.73, + "grad_norm": 1.179041755224835, + "learning_rate": 3.4915350129689798e-06, + "loss": 0.5008, + "step": 9459 + }, + { + "epoch": 0.73, + "grad_norm": 1.2507684253080418, + "learning_rate": 3.489627575070068e-06, + "loss": 0.5181, + "step": 9460 + }, + { + "epoch": 0.73, + "grad_norm": 1.153933860123063, + "learning_rate": 3.487720548204553e-06, + "loss": 0.4959, + "step": 9461 + }, + { + "epoch": 0.73, + "grad_norm": 1.1792388972523369, + "learning_rate": 3.4858139324928388e-06, + "loss": 0.5336, + "step": 9462 + }, + { + "epoch": 0.73, + "grad_norm": 1.2206944335628083, + "learning_rate": 3.4839077280553e-06, + "loss": 0.5073, + "step": 9463 + }, + { + "epoch": 0.73, + "grad_norm": 1.0341639531048983, + "learning_rate": 3.4820019350122847e-06, + "loss": 0.462, + "step": 9464 + }, + { + "epoch": 0.73, + "grad_norm": 1.1788765349286616, + "learning_rate": 3.4800965534841158e-06, + "loss": 0.538, + "step": 9465 + }, + { + "epoch": 0.73, + "grad_norm": 1.1516457722963354, + "learning_rate": 3.4781915835910927e-06, + "loss": 0.4807, + "step": 9466 + }, + { + "epoch": 0.73, + "grad_norm": 1.1233892565729153, + "learning_rate": 3.476287025453484e-06, + "loss": 0.5268, + "step": 9467 + }, + { + "epoch": 0.73, + "grad_norm": 1.2700723287306177, + "learning_rate": 3.474382879191529e-06, + "loss": 0.5311, + "step": 9468 + }, + { + "epoch": 0.73, + "grad_norm": 1.0271704349893325, + "learning_rate": 3.472479144925451e-06, + "loss": 0.4595, + "step": 9469 + }, + { + "epoch": 0.73, + "grad_norm": 1.3595503770670667, + "learning_rate": 3.4705758227754426e-06, + "loss": 0.5599, + "step": 9470 + }, + { + "epoch": 0.73, + "grad_norm": 1.125043974122935, + "learning_rate": 3.4686729128616726e-06, + "loss": 0.5038, + "step": 9471 + }, + { + "epoch": 0.73, + "grad_norm": 1.0696727204924688, + "learning_rate": 3.4667704153042758e-06, + "loss": 0.4675, + "step": 9472 + }, + { + "epoch": 0.73, + "grad_norm": 1.1897369697532185, + "learning_rate": 3.46486833022337e-06, + "loss": 0.5165, + "step": 9473 + }, + { + "epoch": 0.73, + "grad_norm": 1.167925728608404, + "learning_rate": 3.462966657739042e-06, + "loss": 0.5232, + "step": 9474 + }, + { + "epoch": 0.74, + "grad_norm": 1.187137699071726, + "learning_rate": 3.461065397971357e-06, + "loss": 0.4585, + "step": 9475 + }, + { + "epoch": 0.74, + "grad_norm": 1.2564532594742266, + "learning_rate": 3.4591645510403528e-06, + "loss": 0.6012, + "step": 9476 + }, + { + "epoch": 0.74, + "grad_norm": 1.213434954141269, + "learning_rate": 3.457264117066037e-06, + "loss": 0.4945, + "step": 9477 + }, + { + "epoch": 0.74, + "grad_norm": 1.0629648986683833, + "learning_rate": 3.45536409616839e-06, + "loss": 0.4273, + "step": 9478 + }, + { + "epoch": 0.74, + "grad_norm": 1.2954013218813911, + "learning_rate": 3.453464488467373e-06, + "loss": 0.5355, + "step": 9479 + }, + { + "epoch": 0.74, + "grad_norm": 1.1615639825277282, + "learning_rate": 3.4515652940829192e-06, + "loss": 0.452, + "step": 9480 + }, + { + "epoch": 0.74, + "grad_norm": 1.1676074333507098, + "learning_rate": 3.449666513134937e-06, + "loss": 0.4498, + "step": 9481 + }, + { + "epoch": 0.74, + "grad_norm": 1.1809245312098, + "learning_rate": 3.4477681457433e-06, + "loss": 0.5256, + "step": 9482 + }, + { + "epoch": 0.74, + "grad_norm": 1.2467951698848505, + "learning_rate": 3.4458701920278646e-06, + "loss": 0.5342, + "step": 9483 + }, + { + "epoch": 0.74, + "grad_norm": 1.1513420276950452, + "learning_rate": 3.4439726521084595e-06, + "loss": 0.5352, + "step": 9484 + }, + { + "epoch": 0.74, + "grad_norm": 1.2094466846127767, + "learning_rate": 3.4420755261048843e-06, + "loss": 0.4968, + "step": 9485 + }, + { + "epoch": 0.74, + "grad_norm": 1.2738476894154822, + "learning_rate": 3.4401788141369196e-06, + "loss": 0.5073, + "step": 9486 + }, + { + "epoch": 0.74, + "grad_norm": 1.28192293126791, + "learning_rate": 3.4382825163243106e-06, + "loss": 0.5609, + "step": 9487 + }, + { + "epoch": 0.74, + "grad_norm": 1.0975383545901471, + "learning_rate": 3.4363866327867768e-06, + "loss": 0.5158, + "step": 9488 + }, + { + "epoch": 0.74, + "grad_norm": 1.360212221697563, + "learning_rate": 3.434491163644019e-06, + "loss": 0.5538, + "step": 9489 + }, + { + "epoch": 0.74, + "grad_norm": 1.0967503946937076, + "learning_rate": 3.432596109015708e-06, + "loss": 0.5207, + "step": 9490 + }, + { + "epoch": 0.74, + "grad_norm": 1.2211018879937154, + "learning_rate": 3.430701469021491e-06, + "loss": 0.5025, + "step": 9491 + }, + { + "epoch": 0.74, + "grad_norm": 1.1892160263902438, + "learning_rate": 3.4288072437809794e-06, + "loss": 0.498, + "step": 9492 + }, + { + "epoch": 0.74, + "grad_norm": 1.2119828664966084, + "learning_rate": 3.4269134334137698e-06, + "loss": 0.5662, + "step": 9493 + }, + { + "epoch": 0.74, + "grad_norm": 1.2544732163794503, + "learning_rate": 3.4250200380394284e-06, + "loss": 0.5173, + "step": 9494 + }, + { + "epoch": 0.74, + "grad_norm": 1.2514662725862404, + "learning_rate": 3.4231270577774976e-06, + "loss": 0.5239, + "step": 9495 + }, + { + "epoch": 0.74, + "grad_norm": 1.1575979160064678, + "learning_rate": 3.421234492747484e-06, + "loss": 0.5165, + "step": 9496 + }, + { + "epoch": 0.74, + "grad_norm": 1.305615078243322, + "learning_rate": 3.419342343068882e-06, + "loss": 0.5665, + "step": 9497 + }, + { + "epoch": 0.74, + "grad_norm": 1.2087566796071885, + "learning_rate": 3.417450608861147e-06, + "loss": 0.4663, + "step": 9498 + }, + { + "epoch": 0.74, + "grad_norm": 1.1930274814020905, + "learning_rate": 3.4155592902437162e-06, + "loss": 0.5326, + "step": 9499 + }, + { + "epoch": 0.74, + "grad_norm": 1.2406117743644756, + "learning_rate": 3.4136683873359987e-06, + "loss": 0.5467, + "step": 9500 + }, + { + "epoch": 0.74, + "grad_norm": 1.2319065473553306, + "learning_rate": 3.4117779002573803e-06, + "loss": 0.4889, + "step": 9501 + }, + { + "epoch": 0.74, + "grad_norm": 1.2642127269600925, + "learning_rate": 3.40988782912721e-06, + "loss": 0.5367, + "step": 9502 + }, + { + "epoch": 0.74, + "grad_norm": 1.2914795842611744, + "learning_rate": 3.4079981740648215e-06, + "loss": 0.5291, + "step": 9503 + }, + { + "epoch": 0.74, + "grad_norm": 1.2091311364153738, + "learning_rate": 3.406108935189519e-06, + "loss": 0.5409, + "step": 9504 + }, + { + "epoch": 0.74, + "grad_norm": 1.199552160381181, + "learning_rate": 3.404220112620583e-06, + "loss": 0.446, + "step": 9505 + }, + { + "epoch": 0.74, + "grad_norm": 1.2489497541059247, + "learning_rate": 3.402331706477258e-06, + "loss": 0.5278, + "step": 9506 + }, + { + "epoch": 0.74, + "grad_norm": 1.151139124316079, + "learning_rate": 3.400443716878774e-06, + "loss": 0.4736, + "step": 9507 + }, + { + "epoch": 0.74, + "grad_norm": 1.2149511455399715, + "learning_rate": 3.398556143944325e-06, + "loss": 0.5619, + "step": 9508 + }, + { + "epoch": 0.74, + "grad_norm": 1.2118997503844922, + "learning_rate": 3.3966689877930857e-06, + "loss": 0.5061, + "step": 9509 + }, + { + "epoch": 0.74, + "grad_norm": 1.1959842464819288, + "learning_rate": 3.394782248544202e-06, + "loss": 0.5532, + "step": 9510 + }, + { + "epoch": 0.74, + "grad_norm": 1.2025265877494704, + "learning_rate": 3.392895926316795e-06, + "loss": 0.5236, + "step": 9511 + }, + { + "epoch": 0.74, + "grad_norm": 1.0676761653365265, + "learning_rate": 3.3910100212299547e-06, + "loss": 0.4885, + "step": 9512 + }, + { + "epoch": 0.74, + "grad_norm": 1.189237227365878, + "learning_rate": 3.3891245334027487e-06, + "loss": 0.5269, + "step": 9513 + }, + { + "epoch": 0.74, + "grad_norm": 1.3206982472331639, + "learning_rate": 3.387239462954219e-06, + "loss": 0.5313, + "step": 9514 + }, + { + "epoch": 0.74, + "grad_norm": 1.2133309613892842, + "learning_rate": 3.385354810003383e-06, + "loss": 0.4849, + "step": 9515 + }, + { + "epoch": 0.74, + "grad_norm": 1.2180806302503446, + "learning_rate": 3.383470574669222e-06, + "loss": 0.5528, + "step": 9516 + }, + { + "epoch": 0.74, + "grad_norm": 1.217348221829534, + "learning_rate": 3.3815867570707028e-06, + "loss": 0.4838, + "step": 9517 + }, + { + "epoch": 0.74, + "grad_norm": 1.3387243638491388, + "learning_rate": 3.3797033573267546e-06, + "loss": 0.5698, + "step": 9518 + }, + { + "epoch": 0.74, + "grad_norm": 1.2024565982718625, + "learning_rate": 3.377820375556291e-06, + "loss": 0.5233, + "step": 9519 + }, + { + "epoch": 0.74, + "grad_norm": 1.2012118201088033, + "learning_rate": 3.3759378118781917e-06, + "loss": 0.5467, + "step": 9520 + }, + { + "epoch": 0.74, + "grad_norm": 1.265468587747006, + "learning_rate": 3.3740556664113145e-06, + "loss": 0.5555, + "step": 9521 + }, + { + "epoch": 0.74, + "grad_norm": 1.1315449430277773, + "learning_rate": 3.372173939274492e-06, + "loss": 0.5347, + "step": 9522 + }, + { + "epoch": 0.74, + "grad_norm": 1.2533361737408477, + "learning_rate": 3.3702926305865202e-06, + "loss": 0.5456, + "step": 9523 + }, + { + "epoch": 0.74, + "grad_norm": 1.1186749886027787, + "learning_rate": 3.36841174046618e-06, + "loss": 0.4308, + "step": 9524 + }, + { + "epoch": 0.74, + "grad_norm": 1.1082871166909192, + "learning_rate": 3.3665312690322238e-06, + "loss": 0.4934, + "step": 9525 + }, + { + "epoch": 0.74, + "grad_norm": 1.2138847171809726, + "learning_rate": 3.3646512164033696e-06, + "loss": 0.5159, + "step": 9526 + }, + { + "epoch": 0.74, + "grad_norm": 1.1476236690625292, + "learning_rate": 3.362771582698321e-06, + "loss": 0.4966, + "step": 9527 + }, + { + "epoch": 0.74, + "grad_norm": 1.263644941889011, + "learning_rate": 3.3608923680357432e-06, + "loss": 0.5217, + "step": 9528 + }, + { + "epoch": 0.74, + "grad_norm": 1.1583409348016596, + "learning_rate": 3.359013572534283e-06, + "loss": 0.5223, + "step": 9529 + }, + { + "epoch": 0.74, + "grad_norm": 1.1760657083617911, + "learning_rate": 3.3571351963125596e-06, + "loss": 0.5018, + "step": 9530 + }, + { + "epoch": 0.74, + "grad_norm": 1.207479005003192, + "learning_rate": 3.3552572394891635e-06, + "loss": 0.4748, + "step": 9531 + }, + { + "epoch": 0.74, + "grad_norm": 1.155805244480373, + "learning_rate": 3.353379702182664e-06, + "loss": 0.4969, + "step": 9532 + }, + { + "epoch": 0.74, + "grad_norm": 1.1666799726181558, + "learning_rate": 3.3515025845115923e-06, + "loss": 0.4845, + "step": 9533 + }, + { + "epoch": 0.74, + "grad_norm": 1.2978001707076032, + "learning_rate": 3.349625886594464e-06, + "loss": 0.51, + "step": 9534 + }, + { + "epoch": 0.74, + "grad_norm": 1.2741658754681031, + "learning_rate": 3.3477496085497685e-06, + "loss": 0.5663, + "step": 9535 + }, + { + "epoch": 0.74, + "grad_norm": 1.2649210632332923, + "learning_rate": 3.345873750495957e-06, + "loss": 0.5349, + "step": 9536 + }, + { + "epoch": 0.74, + "grad_norm": 1.2388800487906175, + "learning_rate": 3.343998312551471e-06, + "loss": 0.5229, + "step": 9537 + }, + { + "epoch": 0.74, + "grad_norm": 1.1441923297153567, + "learning_rate": 3.3421232948347084e-06, + "loss": 0.5154, + "step": 9538 + }, + { + "epoch": 0.74, + "grad_norm": 1.0550261907354799, + "learning_rate": 3.3402486974640522e-06, + "loss": 0.4497, + "step": 9539 + }, + { + "epoch": 0.74, + "grad_norm": 1.1793390732121094, + "learning_rate": 3.3383745205578555e-06, + "loss": 0.534, + "step": 9540 + }, + { + "epoch": 0.74, + "grad_norm": 1.2234965596370864, + "learning_rate": 3.3365007642344447e-06, + "loss": 0.5412, + "step": 9541 + }, + { + "epoch": 0.74, + "grad_norm": 1.2635890455973273, + "learning_rate": 3.334627428612124e-06, + "loss": 0.4643, + "step": 9542 + }, + { + "epoch": 0.74, + "grad_norm": 1.2149688067572466, + "learning_rate": 3.3327545138091576e-06, + "loss": 0.5446, + "step": 9543 + }, + { + "epoch": 0.74, + "grad_norm": 1.1730049788913282, + "learning_rate": 3.3308820199437987e-06, + "loss": 0.5009, + "step": 9544 + }, + { + "epoch": 0.74, + "grad_norm": 1.2804875314858442, + "learning_rate": 3.3290099471342687e-06, + "loss": 0.5835, + "step": 9545 + }, + { + "epoch": 0.74, + "grad_norm": 1.2764840222145815, + "learning_rate": 3.3271382954987554e-06, + "loss": 0.5901, + "step": 9546 + }, + { + "epoch": 0.74, + "grad_norm": 1.1999281484232625, + "learning_rate": 3.325267065155431e-06, + "loss": 0.5276, + "step": 9547 + }, + { + "epoch": 0.74, + "grad_norm": 1.1760959647623928, + "learning_rate": 3.323396256222432e-06, + "loss": 0.5212, + "step": 9548 + }, + { + "epoch": 0.74, + "grad_norm": 1.1815716179488593, + "learning_rate": 3.321525868817873e-06, + "loss": 0.5038, + "step": 9549 + }, + { + "epoch": 0.74, + "grad_norm": 1.3645097067191698, + "learning_rate": 3.319655903059843e-06, + "loss": 0.5188, + "step": 9550 + }, + { + "epoch": 0.74, + "grad_norm": 1.2459354599055037, + "learning_rate": 3.3177863590664027e-06, + "loss": 0.5191, + "step": 9551 + }, + { + "epoch": 0.74, + "grad_norm": 1.3007752931017413, + "learning_rate": 3.315917236955587e-06, + "loss": 0.5706, + "step": 9552 + }, + { + "epoch": 0.74, + "grad_norm": 1.2057969779382451, + "learning_rate": 3.314048536845399e-06, + "loss": 0.537, + "step": 9553 + }, + { + "epoch": 0.74, + "grad_norm": 1.2124738729749147, + "learning_rate": 3.312180258853822e-06, + "loss": 0.5345, + "step": 9554 + }, + { + "epoch": 0.74, + "grad_norm": 1.0849620362421304, + "learning_rate": 3.3103124030988133e-06, + "loss": 0.4574, + "step": 9555 + }, + { + "epoch": 0.74, + "grad_norm": 1.1479840064446605, + "learning_rate": 3.308444969698292e-06, + "loss": 0.5157, + "step": 9556 + }, + { + "epoch": 0.74, + "grad_norm": 1.1782256409741256, + "learning_rate": 3.3065779587701686e-06, + "loss": 0.5117, + "step": 9557 + }, + { + "epoch": 0.74, + "grad_norm": 1.195709461792309, + "learning_rate": 3.3047113704323085e-06, + "loss": 0.4774, + "step": 9558 + }, + { + "epoch": 0.74, + "grad_norm": 1.176182725869622, + "learning_rate": 3.302845204802563e-06, + "loss": 0.5136, + "step": 9559 + }, + { + "epoch": 0.74, + "grad_norm": 1.3085574927929533, + "learning_rate": 3.300979461998751e-06, + "loss": 0.5196, + "step": 9560 + }, + { + "epoch": 0.74, + "grad_norm": 1.2593984615895066, + "learning_rate": 3.2991141421386696e-06, + "loss": 0.5173, + "step": 9561 + }, + { + "epoch": 0.74, + "grad_norm": 1.2253522794571168, + "learning_rate": 3.2972492453400873e-06, + "loss": 0.5662, + "step": 9562 + }, + { + "epoch": 0.74, + "grad_norm": 1.2340296491557334, + "learning_rate": 3.2953847717207375e-06, + "loss": 0.5348, + "step": 9563 + }, + { + "epoch": 0.74, + "grad_norm": 1.1331738125975577, + "learning_rate": 3.29352072139834e-06, + "loss": 0.5102, + "step": 9564 + }, + { + "epoch": 0.74, + "grad_norm": 1.257181281696201, + "learning_rate": 3.291657094490582e-06, + "loss": 0.5326, + "step": 9565 + }, + { + "epoch": 0.74, + "grad_norm": 1.2155917891253631, + "learning_rate": 3.2897938911151196e-06, + "loss": 0.5237, + "step": 9566 + }, + { + "epoch": 0.74, + "grad_norm": 1.195306316683864, + "learning_rate": 3.287931111389593e-06, + "loss": 0.5074, + "step": 9567 + }, + { + "epoch": 0.74, + "grad_norm": 1.289391400542176, + "learning_rate": 3.2860687554316006e-06, + "loss": 0.5756, + "step": 9568 + }, + { + "epoch": 0.74, + "grad_norm": 1.1225045926652168, + "learning_rate": 3.2842068233587275e-06, + "loss": 0.4464, + "step": 9569 + }, + { + "epoch": 0.74, + "grad_norm": 1.0891565049701193, + "learning_rate": 3.2823453152885266e-06, + "loss": 0.4572, + "step": 9570 + }, + { + "epoch": 0.74, + "grad_norm": 1.2095725457558757, + "learning_rate": 3.280484231338524e-06, + "loss": 0.5182, + "step": 9571 + }, + { + "epoch": 0.74, + "grad_norm": 1.1536845020619162, + "learning_rate": 3.2786235716262236e-06, + "loss": 0.5242, + "step": 9572 + }, + { + "epoch": 0.74, + "grad_norm": 1.2496519080910367, + "learning_rate": 3.276763336269092e-06, + "loss": 0.507, + "step": 9573 + }, + { + "epoch": 0.74, + "grad_norm": 1.1976518863566135, + "learning_rate": 3.2749035253845773e-06, + "loss": 0.5128, + "step": 9574 + }, + { + "epoch": 0.74, + "grad_norm": 1.3234836935113166, + "learning_rate": 3.273044139090105e-06, + "loss": 0.5381, + "step": 9575 + }, + { + "epoch": 0.74, + "grad_norm": 1.2696282921895148, + "learning_rate": 3.271185177503058e-06, + "loss": 0.5333, + "step": 9576 + }, + { + "epoch": 0.74, + "grad_norm": 1.1917294423479057, + "learning_rate": 3.2693266407408064e-06, + "loss": 0.4782, + "step": 9577 + }, + { + "epoch": 0.74, + "grad_norm": 1.1496572439821997, + "learning_rate": 3.267468528920694e-06, + "loss": 0.5007, + "step": 9578 + }, + { + "epoch": 0.74, + "grad_norm": 1.0720940680225253, + "learning_rate": 3.265610842160025e-06, + "loss": 0.4496, + "step": 9579 + }, + { + "epoch": 0.74, + "grad_norm": 1.2297235563583229, + "learning_rate": 3.263753580576089e-06, + "loss": 0.5162, + "step": 9580 + }, + { + "epoch": 0.74, + "grad_norm": 1.224461450723111, + "learning_rate": 3.261896744286144e-06, + "loss": 0.498, + "step": 9581 + }, + { + "epoch": 0.74, + "grad_norm": 1.1557453703922704, + "learning_rate": 3.2600403334074205e-06, + "loss": 0.5217, + "step": 9582 + }, + { + "epoch": 0.74, + "grad_norm": 1.1602727144397882, + "learning_rate": 3.2581843480571285e-06, + "loss": 0.5032, + "step": 9583 + }, + { + "epoch": 0.74, + "grad_norm": 1.2238037769319816, + "learning_rate": 3.2563287883524376e-06, + "loss": 0.5503, + "step": 9584 + }, + { + "epoch": 0.74, + "grad_norm": 1.1477382373543588, + "learning_rate": 3.254473654410507e-06, + "loss": 0.5313, + "step": 9585 + }, + { + "epoch": 0.74, + "grad_norm": 1.227598882293441, + "learning_rate": 3.2526189463484536e-06, + "loss": 0.5105, + "step": 9586 + }, + { + "epoch": 0.74, + "grad_norm": 1.2740430737703992, + "learning_rate": 3.250764664283378e-06, + "loss": 0.5357, + "step": 9587 + }, + { + "epoch": 0.74, + "grad_norm": 1.1937142451658767, + "learning_rate": 3.248910808332354e-06, + "loss": 0.468, + "step": 9588 + }, + { + "epoch": 0.74, + "grad_norm": 1.2481086250037112, + "learning_rate": 3.2470573786124184e-06, + "loss": 0.511, + "step": 9589 + }, + { + "epoch": 0.74, + "grad_norm": 1.140881914442385, + "learning_rate": 3.2452043752405914e-06, + "loss": 0.4756, + "step": 9590 + }, + { + "epoch": 0.74, + "grad_norm": 1.1246766049428927, + "learning_rate": 3.2433517983338627e-06, + "loss": 0.4674, + "step": 9591 + }, + { + "epoch": 0.74, + "grad_norm": 1.1718132511400805, + "learning_rate": 3.241499648009193e-06, + "loss": 0.5062, + "step": 9592 + }, + { + "epoch": 0.74, + "grad_norm": 1.2260687253305598, + "learning_rate": 3.2396479243835243e-06, + "loss": 0.4923, + "step": 9593 + }, + { + "epoch": 0.74, + "grad_norm": 1.3285709754295572, + "learning_rate": 3.237796627573757e-06, + "loss": 0.5492, + "step": 9594 + }, + { + "epoch": 0.74, + "grad_norm": 1.2990738026868491, + "learning_rate": 3.2359457576967812e-06, + "loss": 0.5298, + "step": 9595 + }, + { + "epoch": 0.74, + "grad_norm": 1.1824105772922286, + "learning_rate": 3.2340953148694444e-06, + "loss": 0.4678, + "step": 9596 + }, + { + "epoch": 0.74, + "grad_norm": 1.1391189247253153, + "learning_rate": 3.2322452992085775e-06, + "loss": 0.4816, + "step": 9597 + }, + { + "epoch": 0.74, + "grad_norm": 1.1966098402037928, + "learning_rate": 3.2303957108309846e-06, + "loss": 0.4823, + "step": 9598 + }, + { + "epoch": 0.74, + "grad_norm": 1.1605868583989596, + "learning_rate": 3.2285465498534343e-06, + "loss": 0.5056, + "step": 9599 + }, + { + "epoch": 0.74, + "grad_norm": 1.1648864901995553, + "learning_rate": 3.2266978163926765e-06, + "loss": 0.4967, + "step": 9600 + }, + { + "epoch": 0.74, + "grad_norm": 1.053097975296423, + "learning_rate": 3.2248495105654308e-06, + "loss": 0.4267, + "step": 9601 + }, + { + "epoch": 0.74, + "grad_norm": 1.269275299093629, + "learning_rate": 3.2230016324883906e-06, + "loss": 0.5859, + "step": 9602 + }, + { + "epoch": 0.74, + "grad_norm": 1.260502326303444, + "learning_rate": 3.2211541822782255e-06, + "loss": 0.5236, + "step": 9603 + }, + { + "epoch": 0.75, + "grad_norm": 1.2566710795992027, + "learning_rate": 3.2193071600515678e-06, + "loss": 0.5346, + "step": 9604 + }, + { + "epoch": 0.75, + "grad_norm": 1.1875216833443867, + "learning_rate": 3.2174605659250367e-06, + "loss": 0.5626, + "step": 9605 + }, + { + "epoch": 0.75, + "grad_norm": 1.1233260628684765, + "learning_rate": 3.21561440001521e-06, + "loss": 0.4808, + "step": 9606 + }, + { + "epoch": 0.75, + "grad_norm": 1.2106343535936837, + "learning_rate": 3.213768662438649e-06, + "loss": 0.5286, + "step": 9607 + }, + { + "epoch": 0.75, + "grad_norm": 1.2317500149645764, + "learning_rate": 3.2119233533118864e-06, + "loss": 0.5129, + "step": 9608 + }, + { + "epoch": 0.75, + "grad_norm": 1.273805857548278, + "learning_rate": 3.2100784727514235e-06, + "loss": 0.5209, + "step": 9609 + }, + { + "epoch": 0.75, + "grad_norm": 1.2752639216859851, + "learning_rate": 3.2082340208737363e-06, + "loss": 0.4705, + "step": 9610 + }, + { + "epoch": 0.75, + "grad_norm": 1.2152096114644306, + "learning_rate": 3.206389997795277e-06, + "loss": 0.5098, + "step": 9611 + }, + { + "epoch": 0.75, + "grad_norm": 1.139154557640684, + "learning_rate": 3.204546403632468e-06, + "loss": 0.4852, + "step": 9612 + }, + { + "epoch": 0.75, + "grad_norm": 1.2513026126009654, + "learning_rate": 3.2027032385017065e-06, + "loss": 0.5046, + "step": 9613 + }, + { + "epoch": 0.75, + "grad_norm": 1.2896581949537642, + "learning_rate": 3.2008605025193564e-06, + "loss": 0.5802, + "step": 9614 + }, + { + "epoch": 0.75, + "grad_norm": 1.2939197926433892, + "learning_rate": 3.199018195801765e-06, + "loss": 0.5221, + "step": 9615 + }, + { + "epoch": 0.75, + "grad_norm": 1.300807230921914, + "learning_rate": 3.1971763184652404e-06, + "loss": 0.527, + "step": 9616 + }, + { + "epoch": 0.75, + "grad_norm": 1.2733365352367334, + "learning_rate": 3.1953348706260723e-06, + "loss": 0.5559, + "step": 9617 + }, + { + "epoch": 0.75, + "grad_norm": 1.218033066202077, + "learning_rate": 3.1934938524005243e-06, + "loss": 0.5444, + "step": 9618 + }, + { + "epoch": 0.75, + "grad_norm": 1.2230639280764783, + "learning_rate": 3.1916532639048237e-06, + "loss": 0.527, + "step": 9619 + }, + { + "epoch": 0.75, + "grad_norm": 1.244658214273411, + "learning_rate": 3.1898131052551784e-06, + "loss": 0.5449, + "step": 9620 + }, + { + "epoch": 0.75, + "grad_norm": 1.2124241716722137, + "learning_rate": 3.187973376567769e-06, + "loss": 0.4891, + "step": 9621 + }, + { + "epoch": 0.75, + "grad_norm": 1.2611029099287843, + "learning_rate": 3.1861340779587444e-06, + "loss": 0.5272, + "step": 9622 + }, + { + "epoch": 0.75, + "grad_norm": 1.234839919213569, + "learning_rate": 3.1842952095442335e-06, + "loss": 0.5283, + "step": 9623 + }, + { + "epoch": 0.75, + "grad_norm": 1.155577180325893, + "learning_rate": 3.182456771440329e-06, + "loss": 0.5187, + "step": 9624 + }, + { + "epoch": 0.75, + "grad_norm": 1.1982110974693967, + "learning_rate": 3.1806187637631035e-06, + "loss": 0.4872, + "step": 9625 + }, + { + "epoch": 0.75, + "grad_norm": 1.3175504336032817, + "learning_rate": 3.178781186628597e-06, + "loss": 0.5188, + "step": 9626 + }, + { + "epoch": 0.75, + "grad_norm": 1.132336957365549, + "learning_rate": 3.176944040152826e-06, + "loss": 0.4636, + "step": 9627 + }, + { + "epoch": 0.75, + "grad_norm": 1.1718656921017068, + "learning_rate": 3.1751073244517817e-06, + "loss": 0.4784, + "step": 9628 + }, + { + "epoch": 0.75, + "grad_norm": 1.2166551534987615, + "learning_rate": 3.1732710396414257e-06, + "loss": 0.5338, + "step": 9629 + }, + { + "epoch": 0.75, + "grad_norm": 1.220696923812374, + "learning_rate": 3.1714351858376867e-06, + "loss": 0.498, + "step": 9630 + }, + { + "epoch": 0.75, + "grad_norm": 1.2440496917502224, + "learning_rate": 3.1695997631564756e-06, + "loss": 0.5264, + "step": 9631 + }, + { + "epoch": 0.75, + "grad_norm": 1.2735555395170235, + "learning_rate": 3.167764771713673e-06, + "loss": 0.5668, + "step": 9632 + }, + { + "epoch": 0.75, + "grad_norm": 1.2009856785310227, + "learning_rate": 3.165930211625131e-06, + "loss": 0.4957, + "step": 9633 + }, + { + "epoch": 0.75, + "grad_norm": 1.1969525922120872, + "learning_rate": 3.1640960830066723e-06, + "loss": 0.5027, + "step": 9634 + }, + { + "epoch": 0.75, + "grad_norm": 1.141580573314053, + "learning_rate": 3.1622623859740998e-06, + "loss": 0.4756, + "step": 9635 + }, + { + "epoch": 0.75, + "grad_norm": 1.2348034272144184, + "learning_rate": 3.160429120643177e-06, + "loss": 0.5102, + "step": 9636 + }, + { + "epoch": 0.75, + "grad_norm": 1.1220948537827005, + "learning_rate": 3.1585962871296514e-06, + "loss": 0.5148, + "step": 9637 + }, + { + "epoch": 0.75, + "grad_norm": 1.1861523460012184, + "learning_rate": 3.15676388554924e-06, + "loss": 0.5513, + "step": 9638 + }, + { + "epoch": 0.75, + "grad_norm": 1.1555888889214319, + "learning_rate": 3.154931916017633e-06, + "loss": 0.5081, + "step": 9639 + }, + { + "epoch": 0.75, + "grad_norm": 1.2182523249262076, + "learning_rate": 3.1531003786504877e-06, + "loss": 0.5487, + "step": 9640 + }, + { + "epoch": 0.75, + "grad_norm": 1.1503821795271751, + "learning_rate": 3.151269273563441e-06, + "loss": 0.4865, + "step": 9641 + }, + { + "epoch": 0.75, + "grad_norm": 1.1729633617874444, + "learning_rate": 3.149438600872099e-06, + "loss": 0.5045, + "step": 9642 + }, + { + "epoch": 0.75, + "grad_norm": 1.0840679514515028, + "learning_rate": 3.147608360692046e-06, + "loss": 0.4947, + "step": 9643 + }, + { + "epoch": 0.75, + "grad_norm": 1.1614280947639186, + "learning_rate": 3.1457785531388275e-06, + "loss": 0.5187, + "step": 9644 + }, + { + "epoch": 0.75, + "grad_norm": 1.1710159713639752, + "learning_rate": 3.1439491783279754e-06, + "loss": 0.5399, + "step": 9645 + }, + { + "epoch": 0.75, + "grad_norm": 1.2505672121589768, + "learning_rate": 3.1421202363749816e-06, + "loss": 0.5327, + "step": 9646 + }, + { + "epoch": 0.75, + "grad_norm": 1.210515496338738, + "learning_rate": 3.1402917273953183e-06, + "loss": 0.554, + "step": 9647 + }, + { + "epoch": 0.75, + "grad_norm": 1.2018282453832836, + "learning_rate": 3.13846365150443e-06, + "loss": 0.5085, + "step": 9648 + }, + { + "epoch": 0.75, + "grad_norm": 1.1599178390345357, + "learning_rate": 3.136636008817736e-06, + "loss": 0.5033, + "step": 9649 + }, + { + "epoch": 0.75, + "grad_norm": 1.1208079967447666, + "learning_rate": 3.134808799450617e-06, + "loss": 0.4635, + "step": 9650 + }, + { + "epoch": 0.75, + "grad_norm": 1.1668448255383752, + "learning_rate": 3.1329820235184392e-06, + "loss": 0.5017, + "step": 9651 + }, + { + "epoch": 0.75, + "grad_norm": 1.1973444304426357, + "learning_rate": 3.131155681136535e-06, + "loss": 0.5014, + "step": 9652 + }, + { + "epoch": 0.75, + "grad_norm": 1.2872798268471883, + "learning_rate": 3.129329772420214e-06, + "loss": 0.573, + "step": 9653 + }, + { + "epoch": 0.75, + "grad_norm": 1.1669371268718376, + "learning_rate": 3.12750429748475e-06, + "loss": 0.4799, + "step": 9654 + }, + { + "epoch": 0.75, + "grad_norm": 1.1191617154835367, + "learning_rate": 3.1256792564453997e-06, + "loss": 0.4965, + "step": 9655 + }, + { + "epoch": 0.75, + "grad_norm": 1.1682877631195296, + "learning_rate": 3.1238546494173814e-06, + "loss": 0.4773, + "step": 9656 + }, + { + "epoch": 0.75, + "grad_norm": 1.2555377364417983, + "learning_rate": 3.122030476515896e-06, + "loss": 0.5172, + "step": 9657 + }, + { + "epoch": 0.75, + "grad_norm": 1.2261297835471494, + "learning_rate": 3.120206737856112e-06, + "loss": 0.4972, + "step": 9658 + }, + { + "epoch": 0.75, + "grad_norm": 1.2282230774832554, + "learning_rate": 3.1183834335531748e-06, + "loss": 0.4974, + "step": 9659 + }, + { + "epoch": 0.75, + "grad_norm": 1.2060289387908014, + "learning_rate": 3.116560563722193e-06, + "loss": 0.4786, + "step": 9660 + }, + { + "epoch": 0.75, + "grad_norm": 1.1947679463531546, + "learning_rate": 3.1147381284782562e-06, + "loss": 0.545, + "step": 9661 + }, + { + "epoch": 0.75, + "grad_norm": 1.4195915583058847, + "learning_rate": 3.112916127936425e-06, + "loss": 0.5533, + "step": 9662 + }, + { + "epoch": 0.75, + "grad_norm": 1.1560191877779025, + "learning_rate": 3.111094562211735e-06, + "loss": 0.4798, + "step": 9663 + }, + { + "epoch": 0.75, + "grad_norm": 1.1619429781738808, + "learning_rate": 3.109273431419183e-06, + "loss": 0.4808, + "step": 9664 + }, + { + "epoch": 0.75, + "grad_norm": 1.1951582566018901, + "learning_rate": 3.107452735673755e-06, + "loss": 0.5223, + "step": 9665 + }, + { + "epoch": 0.75, + "grad_norm": 1.32950197532596, + "learning_rate": 3.1056324750903934e-06, + "loss": 0.5584, + "step": 9666 + }, + { + "epoch": 0.75, + "grad_norm": 1.2145058017716204, + "learning_rate": 3.1038126497840225e-06, + "loss": 0.5298, + "step": 9667 + }, + { + "epoch": 0.75, + "grad_norm": 1.1613504961298915, + "learning_rate": 3.1019932598695402e-06, + "loss": 0.4781, + "step": 9668 + }, + { + "epoch": 0.75, + "grad_norm": 1.2345857561035964, + "learning_rate": 3.100174305461815e-06, + "loss": 0.4983, + "step": 9669 + }, + { + "epoch": 0.75, + "grad_norm": 1.2016430930701383, + "learning_rate": 3.0983557866756818e-06, + "loss": 0.5158, + "step": 9670 + }, + { + "epoch": 0.75, + "grad_norm": 1.1028421963427857, + "learning_rate": 3.096537703625955e-06, + "loss": 0.5089, + "step": 9671 + }, + { + "epoch": 0.75, + "grad_norm": 1.2161646098852747, + "learning_rate": 3.0947200564274206e-06, + "loss": 0.4954, + "step": 9672 + }, + { + "epoch": 0.75, + "grad_norm": 1.351248864624224, + "learning_rate": 3.092902845194837e-06, + "loss": 0.5527, + "step": 9673 + }, + { + "epoch": 0.75, + "grad_norm": 1.076700790226702, + "learning_rate": 3.0910860700429315e-06, + "loss": 0.4599, + "step": 9674 + }, + { + "epoch": 0.75, + "grad_norm": 1.1896682820272197, + "learning_rate": 3.0892697310864107e-06, + "loss": 0.4777, + "step": 9675 + }, + { + "epoch": 0.75, + "grad_norm": 1.1848758514172655, + "learning_rate": 3.0874538284399424e-06, + "loss": 0.531, + "step": 9676 + }, + { + "epoch": 0.75, + "grad_norm": 1.0745385820927151, + "learning_rate": 3.0856383622181785e-06, + "loss": 0.5203, + "step": 9677 + }, + { + "epoch": 0.75, + "grad_norm": 1.1340142551313297, + "learning_rate": 3.083823332535738e-06, + "loss": 0.5117, + "step": 9678 + }, + { + "epoch": 0.75, + "grad_norm": 1.2985881382932598, + "learning_rate": 3.0820087395072172e-06, + "loss": 0.5805, + "step": 9679 + }, + { + "epoch": 0.75, + "grad_norm": 1.2501920552532362, + "learning_rate": 3.0801945832471736e-06, + "loss": 0.5804, + "step": 9680 + }, + { + "epoch": 0.75, + "grad_norm": 1.2163620075272463, + "learning_rate": 3.078380863870146e-06, + "loss": 0.4998, + "step": 9681 + }, + { + "epoch": 0.75, + "grad_norm": 1.114146595652614, + "learning_rate": 3.076567581490647e-06, + "loss": 0.4944, + "step": 9682 + }, + { + "epoch": 0.75, + "grad_norm": 1.2460728945526496, + "learning_rate": 3.0747547362231588e-06, + "loss": 0.5611, + "step": 9683 + }, + { + "epoch": 0.75, + "grad_norm": 1.2827749249139446, + "learning_rate": 3.072942328182131e-06, + "loss": 0.54, + "step": 9684 + }, + { + "epoch": 0.75, + "grad_norm": 1.2000758683539983, + "learning_rate": 3.071130357481996e-06, + "loss": 0.5028, + "step": 9685 + }, + { + "epoch": 0.75, + "grad_norm": 1.2700625672284762, + "learning_rate": 3.0693188242371464e-06, + "loss": 0.5593, + "step": 9686 + }, + { + "epoch": 0.75, + "grad_norm": 1.2973007973873976, + "learning_rate": 3.067507728561958e-06, + "loss": 0.4982, + "step": 9687 + }, + { + "epoch": 0.75, + "grad_norm": 1.0810890364595844, + "learning_rate": 3.065697070570772e-06, + "loss": 0.4986, + "step": 9688 + }, + { + "epoch": 0.75, + "grad_norm": 1.2541923315436387, + "learning_rate": 3.0638868503779075e-06, + "loss": 0.5283, + "step": 9689 + }, + { + "epoch": 0.75, + "grad_norm": 1.2070078924690613, + "learning_rate": 3.0620770680976554e-06, + "loss": 0.536, + "step": 9690 + }, + { + "epoch": 0.75, + "grad_norm": 1.1986306702506266, + "learning_rate": 3.060267723844269e-06, + "loss": 0.4594, + "step": 9691 + }, + { + "epoch": 0.75, + "grad_norm": 1.2195843506028903, + "learning_rate": 3.058458817731985e-06, + "loss": 0.5094, + "step": 9692 + }, + { + "epoch": 0.75, + "grad_norm": 1.2256512506719115, + "learning_rate": 3.0566503498750135e-06, + "loss": 0.5288, + "step": 9693 + }, + { + "epoch": 0.75, + "grad_norm": 1.198845373423006, + "learning_rate": 3.0548423203875245e-06, + "loss": 0.4933, + "step": 9694 + }, + { + "epoch": 0.75, + "grad_norm": 1.224672063412419, + "learning_rate": 3.0530347293836758e-06, + "loss": 0.4683, + "step": 9695 + }, + { + "epoch": 0.75, + "grad_norm": 1.269420020292581, + "learning_rate": 3.0512275769775834e-06, + "loss": 0.5721, + "step": 9696 + }, + { + "epoch": 0.75, + "grad_norm": 1.1948019196331745, + "learning_rate": 3.0494208632833443e-06, + "loss": 0.5035, + "step": 9697 + }, + { + "epoch": 0.75, + "grad_norm": 1.156843548578526, + "learning_rate": 3.0476145884150265e-06, + "loss": 0.4974, + "step": 9698 + }, + { + "epoch": 0.75, + "grad_norm": 1.2688441368470538, + "learning_rate": 3.0458087524866697e-06, + "loss": 0.5456, + "step": 9699 + }, + { + "epoch": 0.75, + "grad_norm": 1.1903322976434654, + "learning_rate": 3.0440033556122883e-06, + "loss": 0.5462, + "step": 9700 + }, + { + "epoch": 0.75, + "grad_norm": 1.0663380583594542, + "learning_rate": 3.04219839790586e-06, + "loss": 0.4863, + "step": 9701 + }, + { + "epoch": 0.75, + "grad_norm": 1.020107880852447, + "learning_rate": 3.0403938794813448e-06, + "loss": 0.4864, + "step": 9702 + }, + { + "epoch": 0.75, + "grad_norm": 1.1512526179313922, + "learning_rate": 3.0385898004526725e-06, + "loss": 0.4166, + "step": 9703 + }, + { + "epoch": 0.75, + "grad_norm": 1.1410519049236627, + "learning_rate": 3.0367861609337412e-06, + "loss": 0.4846, + "step": 9704 + }, + { + "epoch": 0.75, + "grad_norm": 1.1613091286017547, + "learning_rate": 3.0349829610384274e-06, + "loss": 0.4816, + "step": 9705 + }, + { + "epoch": 0.75, + "grad_norm": 1.112162727493802, + "learning_rate": 3.03318020088057e-06, + "loss": 0.5055, + "step": 9706 + }, + { + "epoch": 0.75, + "grad_norm": 1.1611980038141168, + "learning_rate": 3.031377880573991e-06, + "loss": 0.4837, + "step": 9707 + }, + { + "epoch": 0.75, + "grad_norm": 1.1704282858681387, + "learning_rate": 3.0295760002324804e-06, + "loss": 0.5137, + "step": 9708 + }, + { + "epoch": 0.75, + "grad_norm": 1.1833039486061843, + "learning_rate": 3.0277745599697996e-06, + "loss": 0.5305, + "step": 9709 + }, + { + "epoch": 0.75, + "grad_norm": 1.1505943338181377, + "learning_rate": 3.025973559899685e-06, + "loss": 0.51, + "step": 9710 + }, + { + "epoch": 0.75, + "grad_norm": 1.2589359360530683, + "learning_rate": 3.0241730001358383e-06, + "loss": 0.517, + "step": 9711 + }, + { + "epoch": 0.75, + "grad_norm": 1.2993040202517023, + "learning_rate": 3.0223728807919406e-06, + "loss": 0.6323, + "step": 9712 + }, + { + "epoch": 0.75, + "grad_norm": 1.2725565384780952, + "learning_rate": 3.020573201981646e-06, + "loss": 0.5138, + "step": 9713 + }, + { + "epoch": 0.75, + "grad_norm": 1.2987660328643698, + "learning_rate": 3.018773963818571e-06, + "loss": 0.5545, + "step": 9714 + }, + { + "epoch": 0.75, + "grad_norm": 1.2787221767944847, + "learning_rate": 3.016975166416317e-06, + "loss": 0.5264, + "step": 9715 + }, + { + "epoch": 0.75, + "grad_norm": 1.2301707073318102, + "learning_rate": 3.015176809888445e-06, + "loss": 0.5101, + "step": 9716 + }, + { + "epoch": 0.75, + "grad_norm": 1.309659381656543, + "learning_rate": 3.0133788943484987e-06, + "loss": 0.505, + "step": 9717 + }, + { + "epoch": 0.75, + "grad_norm": 1.1884990805982245, + "learning_rate": 3.011581419909988e-06, + "loss": 0.5427, + "step": 9718 + }, + { + "epoch": 0.75, + "grad_norm": 1.1365239871204527, + "learning_rate": 3.0097843866863985e-06, + "loss": 0.5214, + "step": 9719 + }, + { + "epoch": 0.75, + "grad_norm": 1.1672198596232213, + "learning_rate": 3.0079877947911883e-06, + "loss": 0.4931, + "step": 9720 + }, + { + "epoch": 0.75, + "grad_norm": 1.2158417849195762, + "learning_rate": 3.0061916443377805e-06, + "loss": 0.4731, + "step": 9721 + }, + { + "epoch": 0.75, + "grad_norm": 1.2957583756350022, + "learning_rate": 3.004395935439577e-06, + "loss": 0.5218, + "step": 9722 + }, + { + "epoch": 0.75, + "grad_norm": 1.2692954445968987, + "learning_rate": 3.002600668209953e-06, + "loss": 0.5631, + "step": 9723 + }, + { + "epoch": 0.75, + "grad_norm": 1.2135708140964503, + "learning_rate": 3.000805842762248e-06, + "loss": 0.5186, + "step": 9724 + }, + { + "epoch": 0.75, + "grad_norm": 1.1757931629477214, + "learning_rate": 2.999011459209784e-06, + "loss": 0.4773, + "step": 9725 + }, + { + "epoch": 0.75, + "grad_norm": 1.2630064448949543, + "learning_rate": 2.9972175176658448e-06, + "loss": 0.5139, + "step": 9726 + }, + { + "epoch": 0.75, + "grad_norm": 1.2904038357530452, + "learning_rate": 2.995424018243692e-06, + "loss": 0.5303, + "step": 9727 + }, + { + "epoch": 0.75, + "grad_norm": 1.0653556707110878, + "learning_rate": 2.9936309610565606e-06, + "loss": 0.4723, + "step": 9728 + }, + { + "epoch": 0.75, + "grad_norm": 1.2497233084575268, + "learning_rate": 2.9918383462176547e-06, + "loss": 0.4913, + "step": 9729 + }, + { + "epoch": 0.75, + "grad_norm": 1.2938495874084577, + "learning_rate": 2.9900461738401545e-06, + "loss": 0.5679, + "step": 9730 + }, + { + "epoch": 0.75, + "grad_norm": 1.2638106351015859, + "learning_rate": 2.988254444037203e-06, + "loss": 0.519, + "step": 9731 + }, + { + "epoch": 0.75, + "grad_norm": 1.204300392979703, + "learning_rate": 2.9864631569219237e-06, + "loss": 0.4885, + "step": 9732 + }, + { + "epoch": 0.76, + "grad_norm": 1.1726332690829093, + "learning_rate": 2.984672312607414e-06, + "loss": 0.4703, + "step": 9733 + }, + { + "epoch": 0.76, + "grad_norm": 1.2400835561287917, + "learning_rate": 2.982881911206733e-06, + "loss": 0.5033, + "step": 9734 + }, + { + "epoch": 0.76, + "grad_norm": 1.241585735696034, + "learning_rate": 2.981091952832923e-06, + "loss": 0.4944, + "step": 9735 + }, + { + "epoch": 0.76, + "grad_norm": 1.2364526960099262, + "learning_rate": 2.9793024375989877e-06, + "loss": 0.5357, + "step": 9736 + }, + { + "epoch": 0.76, + "grad_norm": 1.2138513270943172, + "learning_rate": 2.9775133656179113e-06, + "loss": 0.5658, + "step": 9737 + }, + { + "epoch": 0.76, + "grad_norm": 1.2376171461202323, + "learning_rate": 2.975724737002648e-06, + "loss": 0.557, + "step": 9738 + }, + { + "epoch": 0.76, + "grad_norm": 1.1327460960130638, + "learning_rate": 2.973936551866121e-06, + "loss": 0.4955, + "step": 9739 + }, + { + "epoch": 0.76, + "grad_norm": 1.077405219368041, + "learning_rate": 2.97214881032123e-06, + "loss": 0.4842, + "step": 9740 + }, + { + "epoch": 0.76, + "grad_norm": 1.423243440533533, + "learning_rate": 2.9703615124808484e-06, + "loss": 0.605, + "step": 9741 + }, + { + "epoch": 0.76, + "grad_norm": 1.0908098211782857, + "learning_rate": 2.9685746584578078e-06, + "loss": 0.4956, + "step": 9742 + }, + { + "epoch": 0.76, + "grad_norm": 1.2265318848500568, + "learning_rate": 2.966788248364929e-06, + "loss": 0.5175, + "step": 9743 + }, + { + "epoch": 0.76, + "grad_norm": 1.1097582839996938, + "learning_rate": 2.9650022823149925e-06, + "loss": 0.3929, + "step": 9744 + }, + { + "epoch": 0.76, + "grad_norm": 1.162524596333507, + "learning_rate": 2.9632167604207586e-06, + "loss": 0.4391, + "step": 9745 + }, + { + "epoch": 0.76, + "grad_norm": 1.2255626904231984, + "learning_rate": 2.9614316827949574e-06, + "loss": 0.4882, + "step": 9746 + }, + { + "epoch": 0.76, + "grad_norm": 1.1049093172138844, + "learning_rate": 2.959647049550286e-06, + "loss": 0.4703, + "step": 9747 + }, + { + "epoch": 0.76, + "grad_norm": 1.2790026610334093, + "learning_rate": 2.9578628607994187e-06, + "loss": 0.521, + "step": 9748 + }, + { + "epoch": 0.76, + "grad_norm": 1.2820677589922698, + "learning_rate": 2.956079116655003e-06, + "loss": 0.5336, + "step": 9749 + }, + { + "epoch": 0.76, + "grad_norm": 1.3268876089610115, + "learning_rate": 2.9542958172296533e-06, + "loss": 0.5115, + "step": 9750 + }, + { + "epoch": 0.76, + "grad_norm": 1.1877541521048391, + "learning_rate": 2.9525129626359637e-06, + "loss": 0.5069, + "step": 9751 + }, + { + "epoch": 0.76, + "grad_norm": 1.2870209219239013, + "learning_rate": 2.950730552986487e-06, + "loss": 0.523, + "step": 9752 + }, + { + "epoch": 0.76, + "grad_norm": 1.210638390793444, + "learning_rate": 2.948948588393764e-06, + "loss": 0.4904, + "step": 9753 + }, + { + "epoch": 0.76, + "grad_norm": 1.2198304986459967, + "learning_rate": 2.9471670689702927e-06, + "loss": 0.5251, + "step": 9754 + }, + { + "epoch": 0.76, + "grad_norm": 1.1133374116350807, + "learning_rate": 2.945385994828551e-06, + "loss": 0.4778, + "step": 9755 + }, + { + "epoch": 0.76, + "grad_norm": 1.266781925721349, + "learning_rate": 2.9436053660809914e-06, + "loss": 0.5526, + "step": 9756 + }, + { + "epoch": 0.76, + "grad_norm": 1.276616954225686, + "learning_rate": 2.941825182840029e-06, + "loss": 0.5367, + "step": 9757 + }, + { + "epoch": 0.76, + "grad_norm": 1.0861925882108026, + "learning_rate": 2.940045445218057e-06, + "loss": 0.4439, + "step": 9758 + }, + { + "epoch": 0.76, + "grad_norm": 1.223251246748148, + "learning_rate": 2.9382661533274424e-06, + "loss": 0.5487, + "step": 9759 + }, + { + "epoch": 0.76, + "grad_norm": 1.2432689637472019, + "learning_rate": 2.936487307280518e-06, + "loss": 0.5079, + "step": 9760 + }, + { + "epoch": 0.76, + "grad_norm": 1.341238735894706, + "learning_rate": 2.9347089071895963e-06, + "loss": 0.5597, + "step": 9761 + }, + { + "epoch": 0.76, + "grad_norm": 1.199765804405271, + "learning_rate": 2.9329309531669505e-06, + "loss": 0.4765, + "step": 9762 + }, + { + "epoch": 0.76, + "grad_norm": 1.2067155641234408, + "learning_rate": 2.931153445324837e-06, + "loss": 0.529, + "step": 9763 + }, + { + "epoch": 0.76, + "grad_norm": 1.2344159952430254, + "learning_rate": 2.929376383775475e-06, + "loss": 0.5688, + "step": 9764 + }, + { + "epoch": 0.76, + "grad_norm": 1.1055547997510957, + "learning_rate": 2.927599768631061e-06, + "loss": 0.5002, + "step": 9765 + }, + { + "epoch": 0.76, + "grad_norm": 1.223891636539499, + "learning_rate": 2.9258236000037656e-06, + "loss": 0.5093, + "step": 9766 + }, + { + "epoch": 0.76, + "grad_norm": 1.0899147912938831, + "learning_rate": 2.9240478780057214e-06, + "loss": 0.5142, + "step": 9767 + }, + { + "epoch": 0.76, + "grad_norm": 1.2348902146195369, + "learning_rate": 2.9222726027490413e-06, + "loss": 0.53, + "step": 9768 + }, + { + "epoch": 0.76, + "grad_norm": 1.276223674431669, + "learning_rate": 2.9204977743458084e-06, + "loss": 0.5536, + "step": 9769 + }, + { + "epoch": 0.76, + "grad_norm": 1.21900057050834, + "learning_rate": 2.918723392908076e-06, + "loss": 0.5431, + "step": 9770 + }, + { + "epoch": 0.76, + "grad_norm": 1.2101023101113764, + "learning_rate": 2.916949458547874e-06, + "loss": 0.5313, + "step": 9771 + }, + { + "epoch": 0.76, + "grad_norm": 1.1394050030882883, + "learning_rate": 2.9151759713771933e-06, + "loss": 0.5097, + "step": 9772 + }, + { + "epoch": 0.76, + "grad_norm": 1.2700705453848662, + "learning_rate": 2.9134029315080094e-06, + "loss": 0.5033, + "step": 9773 + }, + { + "epoch": 0.76, + "grad_norm": 1.2294417682275165, + "learning_rate": 2.911630339052257e-06, + "loss": 0.5199, + "step": 9774 + }, + { + "epoch": 0.76, + "grad_norm": 1.14783249054286, + "learning_rate": 2.909858194121853e-06, + "loss": 0.5145, + "step": 9775 + }, + { + "epoch": 0.76, + "grad_norm": 1.1075335518639304, + "learning_rate": 2.908086496828685e-06, + "loss": 0.4875, + "step": 9776 + }, + { + "epoch": 0.76, + "grad_norm": 1.1784986353766367, + "learning_rate": 2.906315247284602e-06, + "loss": 0.4948, + "step": 9777 + }, + { + "epoch": 0.76, + "grad_norm": 1.1317728994356206, + "learning_rate": 2.904544445601436e-06, + "loss": 0.482, + "step": 9778 + }, + { + "epoch": 0.76, + "grad_norm": 1.2665373257220667, + "learning_rate": 2.9027740918909873e-06, + "loss": 0.5572, + "step": 9779 + }, + { + "epoch": 0.76, + "grad_norm": 1.0476162691616189, + "learning_rate": 2.9010041862650273e-06, + "loss": 0.4903, + "step": 9780 + }, + { + "epoch": 0.76, + "grad_norm": 1.1946357357017348, + "learning_rate": 2.899234728835302e-06, + "loss": 0.4706, + "step": 9781 + }, + { + "epoch": 0.76, + "grad_norm": 1.1590615704847174, + "learning_rate": 2.8974657197135203e-06, + "loss": 0.5094, + "step": 9782 + }, + { + "epoch": 0.76, + "grad_norm": 1.3795955329216594, + "learning_rate": 2.895697159011375e-06, + "loss": 0.534, + "step": 9783 + }, + { + "epoch": 0.76, + "grad_norm": 1.1630723569703318, + "learning_rate": 2.893929046840518e-06, + "loss": 0.4747, + "step": 9784 + }, + { + "epoch": 0.76, + "grad_norm": 1.1057504355705519, + "learning_rate": 2.892161383312583e-06, + "loss": 0.5357, + "step": 9785 + }, + { + "epoch": 0.76, + "grad_norm": 1.1968714203544453, + "learning_rate": 2.8903941685391745e-06, + "loss": 0.4947, + "step": 9786 + }, + { + "epoch": 0.76, + "grad_norm": 1.2296795933237805, + "learning_rate": 2.8886274026318593e-06, + "loss": 0.5358, + "step": 9787 + }, + { + "epoch": 0.76, + "grad_norm": 1.2436282841152793, + "learning_rate": 2.886861085702186e-06, + "loss": 0.5366, + "step": 9788 + }, + { + "epoch": 0.76, + "grad_norm": 1.2671993965314863, + "learning_rate": 2.885095217861672e-06, + "loss": 0.5508, + "step": 9789 + }, + { + "epoch": 0.76, + "grad_norm": 1.2071877782368567, + "learning_rate": 2.8833297992218055e-06, + "loss": 0.5654, + "step": 9790 + }, + { + "epoch": 0.76, + "grad_norm": 1.2480367501912277, + "learning_rate": 2.881564829894048e-06, + "loss": 0.5357, + "step": 9791 + }, + { + "epoch": 0.76, + "grad_norm": 1.2153232033126546, + "learning_rate": 2.8798003099898297e-06, + "loss": 0.4976, + "step": 9792 + }, + { + "epoch": 0.76, + "grad_norm": 1.1746961078901785, + "learning_rate": 2.8780362396205495e-06, + "loss": 0.4913, + "step": 9793 + }, + { + "epoch": 0.76, + "grad_norm": 1.0602920254125476, + "learning_rate": 2.8762726188975876e-06, + "loss": 0.4982, + "step": 9794 + }, + { + "epoch": 0.76, + "grad_norm": 1.1111228498527537, + "learning_rate": 2.874509447932288e-06, + "loss": 0.4899, + "step": 9795 + }, + { + "epoch": 0.76, + "grad_norm": 1.1744743510596547, + "learning_rate": 2.872746726835969e-06, + "loss": 0.5362, + "step": 9796 + }, + { + "epoch": 0.76, + "grad_norm": 1.2808508832858856, + "learning_rate": 2.870984455719924e-06, + "loss": 0.5181, + "step": 9797 + }, + { + "epoch": 0.76, + "grad_norm": 1.2362708971901422, + "learning_rate": 2.8692226346954087e-06, + "loss": 0.4979, + "step": 9798 + }, + { + "epoch": 0.76, + "grad_norm": 1.2511913820878486, + "learning_rate": 2.8674612638736576e-06, + "loss": 0.5314, + "step": 9799 + }, + { + "epoch": 0.76, + "grad_norm": 1.3125190733477405, + "learning_rate": 2.865700343365877e-06, + "loss": 0.56, + "step": 9800 + }, + { + "epoch": 0.76, + "grad_norm": 1.2053376201279824, + "learning_rate": 2.8639398732832448e-06, + "loss": 0.5058, + "step": 9801 + }, + { + "epoch": 0.76, + "grad_norm": 1.3062222071707097, + "learning_rate": 2.862179853736905e-06, + "loss": 0.5553, + "step": 9802 + }, + { + "epoch": 0.76, + "grad_norm": 1.1668391043521174, + "learning_rate": 2.860420284837975e-06, + "loss": 0.5037, + "step": 9803 + }, + { + "epoch": 0.76, + "grad_norm": 1.2755488112114801, + "learning_rate": 2.858661166697547e-06, + "loss": 0.5874, + "step": 9804 + }, + { + "epoch": 0.76, + "grad_norm": 1.2417344040710159, + "learning_rate": 2.8569024994266848e-06, + "loss": 0.5099, + "step": 9805 + }, + { + "epoch": 0.76, + "grad_norm": 1.0850638298737119, + "learning_rate": 2.855144283136421e-06, + "loss": 0.491, + "step": 9806 + }, + { + "epoch": 0.76, + "grad_norm": 1.1009858698618142, + "learning_rate": 2.853386517937764e-06, + "loss": 0.4518, + "step": 9807 + }, + { + "epoch": 0.76, + "grad_norm": 1.1702659241151738, + "learning_rate": 2.8516292039416847e-06, + "loss": 0.4775, + "step": 9808 + }, + { + "epoch": 0.76, + "grad_norm": 1.1523150424290927, + "learning_rate": 2.8498723412591357e-06, + "loss": 0.4938, + "step": 9809 + }, + { + "epoch": 0.76, + "grad_norm": 1.1364860689792136, + "learning_rate": 2.848115930001034e-06, + "loss": 0.4626, + "step": 9810 + }, + { + "epoch": 0.76, + "grad_norm": 1.2122917232370285, + "learning_rate": 2.8463599702782764e-06, + "loss": 0.5119, + "step": 9811 + }, + { + "epoch": 0.76, + "grad_norm": 1.2063386281714108, + "learning_rate": 2.8446044622017223e-06, + "loss": 0.4895, + "step": 9812 + }, + { + "epoch": 0.76, + "grad_norm": 1.276436206262035, + "learning_rate": 2.842849405882202e-06, + "loss": 0.5402, + "step": 9813 + }, + { + "epoch": 0.76, + "grad_norm": 1.0930694097448517, + "learning_rate": 2.841094801430524e-06, + "loss": 0.482, + "step": 9814 + }, + { + "epoch": 0.76, + "grad_norm": 1.1627584744302293, + "learning_rate": 2.839340648957467e-06, + "loss": 0.5275, + "step": 9815 + }, + { + "epoch": 0.76, + "grad_norm": 1.4099065078690134, + "learning_rate": 2.8375869485737782e-06, + "loss": 0.6152, + "step": 9816 + }, + { + "epoch": 0.76, + "grad_norm": 1.3101427617797599, + "learning_rate": 2.8358337003901826e-06, + "loss": 0.5267, + "step": 9817 + }, + { + "epoch": 0.76, + "grad_norm": 1.0559675572642508, + "learning_rate": 2.8340809045173646e-06, + "loss": 0.4663, + "step": 9818 + }, + { + "epoch": 0.76, + "grad_norm": 1.266785266410155, + "learning_rate": 2.83232856106599e-06, + "loss": 0.5083, + "step": 9819 + }, + { + "epoch": 0.76, + "grad_norm": 1.1709059714233228, + "learning_rate": 2.830576670146694e-06, + "loss": 0.4736, + "step": 9820 + }, + { + "epoch": 0.76, + "grad_norm": 1.2864326237671966, + "learning_rate": 2.828825231870085e-06, + "loss": 0.5468, + "step": 9821 + }, + { + "epoch": 0.76, + "grad_norm": 1.312573067584006, + "learning_rate": 2.827074246346737e-06, + "loss": 0.5718, + "step": 9822 + }, + { + "epoch": 0.76, + "grad_norm": 1.201206212987339, + "learning_rate": 2.825323713687197e-06, + "loss": 0.4914, + "step": 9823 + }, + { + "epoch": 0.76, + "grad_norm": 1.2015729033140081, + "learning_rate": 2.823573634001987e-06, + "loss": 0.4981, + "step": 9824 + }, + { + "epoch": 0.76, + "grad_norm": 1.2069773739145497, + "learning_rate": 2.821824007401599e-06, + "loss": 0.5177, + "step": 9825 + }, + { + "epoch": 0.76, + "grad_norm": 1.305339216452509, + "learning_rate": 2.8200748339964966e-06, + "loss": 0.5806, + "step": 9826 + }, + { + "epoch": 0.76, + "grad_norm": 1.1048882244072646, + "learning_rate": 2.818326113897115e-06, + "loss": 0.4816, + "step": 9827 + }, + { + "epoch": 0.76, + "grad_norm": 0.9766718688756859, + "learning_rate": 2.8165778472138572e-06, + "loss": 0.4057, + "step": 9828 + }, + { + "epoch": 0.76, + "grad_norm": 1.307712315011487, + "learning_rate": 2.8148300340571e-06, + "loss": 0.5007, + "step": 9829 + }, + { + "epoch": 0.76, + "grad_norm": 1.1727222177552468, + "learning_rate": 2.813082674537194e-06, + "loss": 0.4607, + "step": 9830 + }, + { + "epoch": 0.76, + "grad_norm": 1.2639675354621294, + "learning_rate": 2.8113357687644615e-06, + "loss": 0.5698, + "step": 9831 + }, + { + "epoch": 0.76, + "grad_norm": 1.1162285590595595, + "learning_rate": 2.8095893168491907e-06, + "loss": 0.4915, + "step": 9832 + }, + { + "epoch": 0.76, + "grad_norm": 1.2145241565367146, + "learning_rate": 2.8078433189016406e-06, + "loss": 0.539, + "step": 9833 + }, + { + "epoch": 0.76, + "grad_norm": 1.1205390860648494, + "learning_rate": 2.8060977750320485e-06, + "loss": 0.4505, + "step": 9834 + }, + { + "epoch": 0.76, + "grad_norm": 1.2136081409939452, + "learning_rate": 2.8043526853506187e-06, + "loss": 0.5186, + "step": 9835 + }, + { + "epoch": 0.76, + "grad_norm": 1.2579950235750537, + "learning_rate": 2.80260804996753e-06, + "loss": 0.5054, + "step": 9836 + }, + { + "epoch": 0.76, + "grad_norm": 1.0470881458621706, + "learning_rate": 2.8008638689929314e-06, + "loss": 0.4611, + "step": 9837 + }, + { + "epoch": 0.76, + "grad_norm": 1.2401703585225536, + "learning_rate": 2.799120142536935e-06, + "loss": 0.5706, + "step": 9838 + }, + { + "epoch": 0.76, + "grad_norm": 1.2317386916028414, + "learning_rate": 2.7973768707096373e-06, + "loss": 0.5346, + "step": 9839 + }, + { + "epoch": 0.76, + "grad_norm": 1.0838109333019748, + "learning_rate": 2.795634053621098e-06, + "loss": 0.4016, + "step": 9840 + }, + { + "epoch": 0.76, + "grad_norm": 1.273861867018515, + "learning_rate": 2.793891691381353e-06, + "loss": 0.4628, + "step": 9841 + }, + { + "epoch": 0.76, + "grad_norm": 1.048437347639502, + "learning_rate": 2.792149784100404e-06, + "loss": 0.4574, + "step": 9842 + }, + { + "epoch": 0.76, + "grad_norm": 1.1666399180661071, + "learning_rate": 2.790408331888225e-06, + "loss": 0.5149, + "step": 9843 + }, + { + "epoch": 0.76, + "grad_norm": 1.1567957337693864, + "learning_rate": 2.788667334854764e-06, + "loss": 0.4376, + "step": 9844 + }, + { + "epoch": 0.76, + "grad_norm": 1.1276946115965643, + "learning_rate": 2.78692679310994e-06, + "loss": 0.4975, + "step": 9845 + }, + { + "epoch": 0.76, + "grad_norm": 1.1262328492276015, + "learning_rate": 2.7851867067636407e-06, + "loss": 0.4986, + "step": 9846 + }, + { + "epoch": 0.76, + "grad_norm": 1.166700254819657, + "learning_rate": 2.783447075925729e-06, + "loss": 0.4907, + "step": 9847 + }, + { + "epoch": 0.76, + "grad_norm": 1.2237907727845774, + "learning_rate": 2.78170790070604e-06, + "loss": 0.5076, + "step": 9848 + }, + { + "epoch": 0.76, + "grad_norm": 1.2771501773601277, + "learning_rate": 2.779969181214368e-06, + "loss": 0.5389, + "step": 9849 + }, + { + "epoch": 0.76, + "grad_norm": 1.1612377327885974, + "learning_rate": 2.7782309175604937e-06, + "loss": 0.4894, + "step": 9850 + }, + { + "epoch": 0.76, + "grad_norm": 1.2163480417660855, + "learning_rate": 2.7764931098541627e-06, + "loss": 0.5454, + "step": 9851 + }, + { + "epoch": 0.76, + "grad_norm": 1.1415034530159667, + "learning_rate": 2.7747557582050878e-06, + "loss": 0.5224, + "step": 9852 + }, + { + "epoch": 0.76, + "grad_norm": 1.1984250303015094, + "learning_rate": 2.7730188627229617e-06, + "loss": 0.5258, + "step": 9853 + }, + { + "epoch": 0.76, + "grad_norm": 1.2364348113927364, + "learning_rate": 2.7712824235174384e-06, + "loss": 0.5428, + "step": 9854 + }, + { + "epoch": 0.76, + "grad_norm": 1.1589806248392363, + "learning_rate": 2.769546440698151e-06, + "loss": 0.4916, + "step": 9855 + }, + { + "epoch": 0.76, + "grad_norm": 1.23084618970854, + "learning_rate": 2.767810914374701e-06, + "loss": 0.5467, + "step": 9856 + }, + { + "epoch": 0.76, + "grad_norm": 1.2831385115754264, + "learning_rate": 2.7660758446566616e-06, + "loss": 0.5767, + "step": 9857 + }, + { + "epoch": 0.76, + "grad_norm": 1.2239159380557223, + "learning_rate": 2.7643412316535788e-06, + "loss": 0.4891, + "step": 9858 + }, + { + "epoch": 0.76, + "grad_norm": 1.2424230772035525, + "learning_rate": 2.7626070754749623e-06, + "loss": 0.5698, + "step": 9859 + }, + { + "epoch": 0.76, + "grad_norm": 1.2754815669445625, + "learning_rate": 2.7608733762303007e-06, + "loss": 0.5527, + "step": 9860 + }, + { + "epoch": 0.76, + "grad_norm": 1.2197909189293077, + "learning_rate": 2.7591401340290546e-06, + "loss": 0.542, + "step": 9861 + }, + { + "epoch": 0.77, + "grad_norm": 1.2989702419458093, + "learning_rate": 2.7574073489806473e-06, + "loss": 0.5575, + "step": 9862 + }, + { + "epoch": 0.77, + "grad_norm": 1.1437430418042591, + "learning_rate": 2.7556750211944848e-06, + "loss": 0.5385, + "step": 9863 + }, + { + "epoch": 0.77, + "grad_norm": 1.2048709635678325, + "learning_rate": 2.7539431507799298e-06, + "loss": 0.5523, + "step": 9864 + }, + { + "epoch": 0.77, + "grad_norm": 1.2138302123007683, + "learning_rate": 2.752211737846329e-06, + "loss": 0.5013, + "step": 9865 + }, + { + "epoch": 0.77, + "grad_norm": 1.1721150978811634, + "learning_rate": 2.7504807825029946e-06, + "loss": 0.5307, + "step": 9866 + }, + { + "epoch": 0.77, + "grad_norm": 1.1296348464534995, + "learning_rate": 2.7487502848592107e-06, + "loss": 0.5224, + "step": 9867 + }, + { + "epoch": 0.77, + "grad_norm": 1.0287660102015583, + "learning_rate": 2.7470202450242368e-06, + "loss": 0.4638, + "step": 9868 + }, + { + "epoch": 0.77, + "grad_norm": 1.1097480254236276, + "learning_rate": 2.745290663107292e-06, + "loss": 0.5129, + "step": 9869 + }, + { + "epoch": 0.77, + "grad_norm": 1.185768722566527, + "learning_rate": 2.7435615392175763e-06, + "loss": 0.5038, + "step": 9870 + }, + { + "epoch": 0.77, + "grad_norm": 1.1983693746329496, + "learning_rate": 2.741832873464262e-06, + "loss": 0.533, + "step": 9871 + }, + { + "epoch": 0.77, + "grad_norm": 1.2524288898162346, + "learning_rate": 2.7401046659564833e-06, + "loss": 0.5055, + "step": 9872 + }, + { + "epoch": 0.77, + "grad_norm": 1.2555332264607286, + "learning_rate": 2.7383769168033557e-06, + "loss": 0.5082, + "step": 9873 + }, + { + "epoch": 0.77, + "grad_norm": 1.2141966737029612, + "learning_rate": 2.736649626113955e-06, + "loss": 0.5693, + "step": 9874 + }, + { + "epoch": 0.77, + "grad_norm": 1.147146108738421, + "learning_rate": 2.7349227939973388e-06, + "loss": 0.4981, + "step": 9875 + }, + { + "epoch": 0.77, + "grad_norm": 1.3258379823486097, + "learning_rate": 2.7331964205625282e-06, + "loss": 0.5669, + "step": 9876 + }, + { + "epoch": 0.77, + "grad_norm": 1.2285800061845278, + "learning_rate": 2.7314705059185196e-06, + "loss": 0.4827, + "step": 9877 + }, + { + "epoch": 0.77, + "grad_norm": 1.1998316289083573, + "learning_rate": 2.7297450501742817e-06, + "loss": 0.521, + "step": 9878 + }, + { + "epoch": 0.77, + "grad_norm": 1.197441449097756, + "learning_rate": 2.728020053438746e-06, + "loss": 0.496, + "step": 9879 + }, + { + "epoch": 0.77, + "grad_norm": 1.1359180370064426, + "learning_rate": 2.7262955158208215e-06, + "loss": 0.4892, + "step": 9880 + }, + { + "epoch": 0.77, + "grad_norm": 1.1325942256709565, + "learning_rate": 2.724571437429393e-06, + "loss": 0.4609, + "step": 9881 + }, + { + "epoch": 0.77, + "grad_norm": 1.1403616509752306, + "learning_rate": 2.722847818373302e-06, + "loss": 0.5117, + "step": 9882 + }, + { + "epoch": 0.77, + "grad_norm": 1.2522275626864017, + "learning_rate": 2.721124658761376e-06, + "loss": 0.5464, + "step": 9883 + }, + { + "epoch": 0.77, + "grad_norm": 1.191775105395397, + "learning_rate": 2.7194019587024024e-06, + "loss": 0.5021, + "step": 9884 + }, + { + "epoch": 0.77, + "grad_norm": 1.2361227289765888, + "learning_rate": 2.717679718305145e-06, + "loss": 0.537, + "step": 9885 + }, + { + "epoch": 0.77, + "grad_norm": 1.2553181527665496, + "learning_rate": 2.7159579376783397e-06, + "loss": 0.5136, + "step": 9886 + }, + { + "epoch": 0.77, + "grad_norm": 1.2008053103973155, + "learning_rate": 2.7142366169306898e-06, + "loss": 0.5227, + "step": 9887 + }, + { + "epoch": 0.77, + "grad_norm": 1.2377461140509411, + "learning_rate": 2.712515756170876e-06, + "loss": 0.549, + "step": 9888 + }, + { + "epoch": 0.77, + "grad_norm": 1.210476301412687, + "learning_rate": 2.710795355507537e-06, + "loss": 0.4939, + "step": 9889 + }, + { + "epoch": 0.77, + "grad_norm": 1.238102319346952, + "learning_rate": 2.709075415049298e-06, + "loss": 0.5496, + "step": 9890 + }, + { + "epoch": 0.77, + "grad_norm": 1.1124386995550088, + "learning_rate": 2.7073559349047406e-06, + "loss": 0.4835, + "step": 9891 + }, + { + "epoch": 0.77, + "grad_norm": 1.169190906038654, + "learning_rate": 2.705636915182429e-06, + "loss": 0.5328, + "step": 9892 + }, + { + "epoch": 0.77, + "grad_norm": 1.1445753590283219, + "learning_rate": 2.7039183559908954e-06, + "loss": 0.4852, + "step": 9893 + }, + { + "epoch": 0.77, + "grad_norm": 1.2546000715034669, + "learning_rate": 2.702200257438636e-06, + "loss": 0.4741, + "step": 9894 + }, + { + "epoch": 0.77, + "grad_norm": 1.1876169448288676, + "learning_rate": 2.700482619634126e-06, + "loss": 0.4768, + "step": 9895 + }, + { + "epoch": 0.77, + "grad_norm": 1.1513088946028456, + "learning_rate": 2.698765442685809e-06, + "loss": 0.4838, + "step": 9896 + }, + { + "epoch": 0.77, + "grad_norm": 1.1666610184033224, + "learning_rate": 2.6970487267020985e-06, + "loss": 0.5128, + "step": 9897 + }, + { + "epoch": 0.77, + "grad_norm": 1.0987791786181917, + "learning_rate": 2.695332471791384e-06, + "loss": 0.5172, + "step": 9898 + }, + { + "epoch": 0.77, + "grad_norm": 1.2175867446922293, + "learning_rate": 2.6936166780620143e-06, + "loss": 0.4417, + "step": 9899 + }, + { + "epoch": 0.77, + "grad_norm": 1.1778550220646553, + "learning_rate": 2.691901345622322e-06, + "loss": 0.5309, + "step": 9900 + }, + { + "epoch": 0.77, + "grad_norm": 1.2065606048418924, + "learning_rate": 2.6901864745806004e-06, + "loss": 0.533, + "step": 9901 + }, + { + "epoch": 0.77, + "grad_norm": 1.2534833058119357, + "learning_rate": 2.68847206504512e-06, + "loss": 0.5359, + "step": 9902 + }, + { + "epoch": 0.77, + "grad_norm": 1.1870311514214298, + "learning_rate": 2.6867581171241207e-06, + "loss": 0.5264, + "step": 9903 + }, + { + "epoch": 0.77, + "grad_norm": 1.2038317994621088, + "learning_rate": 2.685044630925816e-06, + "loss": 0.4904, + "step": 9904 + }, + { + "epoch": 0.77, + "grad_norm": 1.201498492734855, + "learning_rate": 2.6833316065583805e-06, + "loss": 0.4722, + "step": 9905 + }, + { + "epoch": 0.77, + "grad_norm": 1.1909770755403253, + "learning_rate": 2.6816190441299695e-06, + "loss": 0.5363, + "step": 9906 + }, + { + "epoch": 0.77, + "grad_norm": 1.3065477460800525, + "learning_rate": 2.6799069437487067e-06, + "loss": 0.5407, + "step": 9907 + }, + { + "epoch": 0.77, + "grad_norm": 1.2176764234675306, + "learning_rate": 2.678195305522686e-06, + "loss": 0.4867, + "step": 9908 + }, + { + "epoch": 0.77, + "grad_norm": 1.254108068083646, + "learning_rate": 2.676484129559973e-06, + "loss": 0.4984, + "step": 9909 + }, + { + "epoch": 0.77, + "grad_norm": 1.2363606670128513, + "learning_rate": 2.6747734159686012e-06, + "loss": 0.5248, + "step": 9910 + }, + { + "epoch": 0.77, + "grad_norm": 1.2031140512736145, + "learning_rate": 2.6730631648565753e-06, + "loss": 0.4716, + "step": 9911 + }, + { + "epoch": 0.77, + "grad_norm": 1.1020562438423822, + "learning_rate": 2.6713533763318724e-06, + "loss": 0.4827, + "step": 9912 + }, + { + "epoch": 0.77, + "grad_norm": 1.1987326068431798, + "learning_rate": 2.6696440505024423e-06, + "loss": 0.4959, + "step": 9913 + }, + { + "epoch": 0.77, + "grad_norm": 1.1936124792326168, + "learning_rate": 2.667935187476206e-06, + "loss": 0.4999, + "step": 9914 + }, + { + "epoch": 0.77, + "grad_norm": 1.2021843341639942, + "learning_rate": 2.666226787361046e-06, + "loss": 0.5125, + "step": 9915 + }, + { + "epoch": 0.77, + "grad_norm": 1.2260604122330965, + "learning_rate": 2.6645188502648266e-06, + "loss": 0.5427, + "step": 9916 + }, + { + "epoch": 0.77, + "grad_norm": 1.1891810915704066, + "learning_rate": 2.662811376295379e-06, + "loss": 0.4743, + "step": 9917 + }, + { + "epoch": 0.77, + "grad_norm": 1.2013367578494354, + "learning_rate": 2.661104365560504e-06, + "loss": 0.545, + "step": 9918 + }, + { + "epoch": 0.77, + "grad_norm": 1.2662350161721243, + "learning_rate": 2.6593978181679758e-06, + "loss": 0.5615, + "step": 9919 + }, + { + "epoch": 0.77, + "grad_norm": 1.227243319554038, + "learning_rate": 2.657691734225537e-06, + "loss": 0.5301, + "step": 9920 + }, + { + "epoch": 0.77, + "grad_norm": 1.3088219287128833, + "learning_rate": 2.655986113840897e-06, + "loss": 0.5568, + "step": 9921 + }, + { + "epoch": 0.77, + "grad_norm": 1.1906285483640324, + "learning_rate": 2.6542809571217445e-06, + "loss": 0.4635, + "step": 9922 + }, + { + "epoch": 0.77, + "grad_norm": 1.2277681778742962, + "learning_rate": 2.6525762641757336e-06, + "loss": 0.5062, + "step": 9923 + }, + { + "epoch": 0.77, + "grad_norm": 1.1496360389450744, + "learning_rate": 2.650872035110493e-06, + "loss": 0.5044, + "step": 9924 + }, + { + "epoch": 0.77, + "grad_norm": 1.084056020207639, + "learning_rate": 2.6491682700336165e-06, + "loss": 0.4536, + "step": 9925 + }, + { + "epoch": 0.77, + "grad_norm": 1.1961519871243682, + "learning_rate": 2.6474649690526697e-06, + "loss": 0.5105, + "step": 9926 + }, + { + "epoch": 0.77, + "grad_norm": 1.1817719693918878, + "learning_rate": 2.645762132275196e-06, + "loss": 0.4965, + "step": 9927 + }, + { + "epoch": 0.77, + "grad_norm": 1.2424026878672974, + "learning_rate": 2.6440597598087005e-06, + "loss": 0.5254, + "step": 9928 + }, + { + "epoch": 0.77, + "grad_norm": 1.1675381584384779, + "learning_rate": 2.642357851760666e-06, + "loss": 0.4911, + "step": 9929 + }, + { + "epoch": 0.77, + "grad_norm": 1.230200408255395, + "learning_rate": 2.640656408238542e-06, + "loss": 0.5253, + "step": 9930 + }, + { + "epoch": 0.77, + "grad_norm": 1.3127762185949048, + "learning_rate": 2.6389554293497455e-06, + "loss": 0.5661, + "step": 9931 + }, + { + "epoch": 0.77, + "grad_norm": 1.173912311905404, + "learning_rate": 2.6372549152016703e-06, + "loss": 0.4749, + "step": 9932 + }, + { + "epoch": 0.77, + "grad_norm": 1.1288718248022118, + "learning_rate": 2.6355548659016796e-06, + "loss": 0.4752, + "step": 9933 + }, + { + "epoch": 0.77, + "grad_norm": 1.2205444721119403, + "learning_rate": 2.633855281557108e-06, + "loss": 0.4786, + "step": 9934 + }, + { + "epoch": 0.77, + "grad_norm": 1.2400897565012714, + "learning_rate": 2.6321561622752543e-06, + "loss": 0.4998, + "step": 9935 + }, + { + "epoch": 0.77, + "grad_norm": 1.226849904981837, + "learning_rate": 2.6304575081633944e-06, + "loss": 0.4727, + "step": 9936 + }, + { + "epoch": 0.77, + "grad_norm": 1.2553298332113563, + "learning_rate": 2.628759319328774e-06, + "loss": 0.4812, + "step": 9937 + }, + { + "epoch": 0.77, + "grad_norm": 1.1466965747045235, + "learning_rate": 2.6270615958786094e-06, + "loss": 0.4982, + "step": 9938 + }, + { + "epoch": 0.77, + "grad_norm": 1.1735122433362208, + "learning_rate": 2.625364337920088e-06, + "loss": 0.527, + "step": 9939 + }, + { + "epoch": 0.77, + "grad_norm": 1.1700131694345968, + "learning_rate": 2.6236675455603634e-06, + "loss": 0.4744, + "step": 9940 + }, + { + "epoch": 0.77, + "grad_norm": 1.1405423800710872, + "learning_rate": 2.6219712189065616e-06, + "loss": 0.4521, + "step": 9941 + }, + { + "epoch": 0.77, + "grad_norm": 1.1533850668640135, + "learning_rate": 2.6202753580657813e-06, + "loss": 0.5114, + "step": 9942 + }, + { + "epoch": 0.77, + "grad_norm": 1.2608868953945076, + "learning_rate": 2.6185799631450926e-06, + "loss": 0.534, + "step": 9943 + }, + { + "epoch": 0.77, + "grad_norm": 1.1812717839407174, + "learning_rate": 2.6168850342515375e-06, + "loss": 0.4902, + "step": 9944 + }, + { + "epoch": 0.77, + "grad_norm": 1.1256638793330935, + "learning_rate": 2.6151905714921187e-06, + "loss": 0.489, + "step": 9945 + }, + { + "epoch": 0.77, + "grad_norm": 1.3486541591670242, + "learning_rate": 2.6134965749738195e-06, + "loss": 0.5749, + "step": 9946 + }, + { + "epoch": 0.77, + "grad_norm": 1.1043014564054794, + "learning_rate": 2.61180304480359e-06, + "loss": 0.4637, + "step": 9947 + }, + { + "epoch": 0.77, + "grad_norm": 1.1245948273809971, + "learning_rate": 2.6101099810883535e-06, + "loss": 0.5029, + "step": 9948 + }, + { + "epoch": 0.77, + "grad_norm": 1.3349345943416269, + "learning_rate": 2.6084173839350036e-06, + "loss": 0.5671, + "step": 9949 + }, + { + "epoch": 0.77, + "grad_norm": 1.0676599755229847, + "learning_rate": 2.6067252534503996e-06, + "loss": 0.4801, + "step": 9950 + }, + { + "epoch": 0.77, + "grad_norm": 1.263210064846937, + "learning_rate": 2.6050335897413713e-06, + "loss": 0.492, + "step": 9951 + }, + { + "epoch": 0.77, + "grad_norm": 1.2609827593654122, + "learning_rate": 2.6033423929147263e-06, + "loss": 0.5172, + "step": 9952 + }, + { + "epoch": 0.77, + "grad_norm": 1.2073606270071737, + "learning_rate": 2.6016516630772372e-06, + "loss": 0.4781, + "step": 9953 + }, + { + "epoch": 0.77, + "grad_norm": 1.2026512402220375, + "learning_rate": 2.5999614003356523e-06, + "loss": 0.4969, + "step": 9954 + }, + { + "epoch": 0.77, + "grad_norm": 1.1661161190925189, + "learning_rate": 2.5982716047966803e-06, + "loss": 0.5243, + "step": 9955 + }, + { + "epoch": 0.77, + "grad_norm": 1.1935415175767379, + "learning_rate": 2.59658227656701e-06, + "loss": 0.4837, + "step": 9956 + }, + { + "epoch": 0.77, + "grad_norm": 1.2761928961791225, + "learning_rate": 2.5948934157532968e-06, + "loss": 0.5632, + "step": 9957 + }, + { + "epoch": 0.77, + "grad_norm": 1.129951230996921, + "learning_rate": 2.5932050224621685e-06, + "loss": 0.5282, + "step": 9958 + }, + { + "epoch": 0.77, + "grad_norm": 1.266337394372129, + "learning_rate": 2.5915170968002236e-06, + "loss": 0.5493, + "step": 9959 + }, + { + "epoch": 0.77, + "grad_norm": 1.108274693240437, + "learning_rate": 2.589829638874026e-06, + "loss": 0.5348, + "step": 9960 + }, + { + "epoch": 0.77, + "grad_norm": 1.1932922444293494, + "learning_rate": 2.5881426487901127e-06, + "loss": 0.4623, + "step": 9961 + }, + { + "epoch": 0.77, + "grad_norm": 1.2101212734735998, + "learning_rate": 2.586456126654995e-06, + "loss": 0.496, + "step": 9962 + }, + { + "epoch": 0.77, + "grad_norm": 1.1229493737396894, + "learning_rate": 2.584770072575149e-06, + "loss": 0.494, + "step": 9963 + }, + { + "epoch": 0.77, + "grad_norm": 1.2912410475798146, + "learning_rate": 2.583084486657027e-06, + "loss": 0.507, + "step": 9964 + }, + { + "epoch": 0.77, + "grad_norm": 1.3017216854133336, + "learning_rate": 2.5813993690070504e-06, + "loss": 0.5185, + "step": 9965 + }, + { + "epoch": 0.77, + "grad_norm": 1.2180455935389933, + "learning_rate": 2.579714719731604e-06, + "loss": 0.5493, + "step": 9966 + }, + { + "epoch": 0.77, + "grad_norm": 1.266237040281112, + "learning_rate": 2.5780305389370507e-06, + "loss": 0.5473, + "step": 9967 + }, + { + "epoch": 0.77, + "grad_norm": 1.1009691953793233, + "learning_rate": 2.576346826729722e-06, + "loss": 0.4465, + "step": 9968 + }, + { + "epoch": 0.77, + "grad_norm": 1.2663953344120586, + "learning_rate": 2.5746635832159216e-06, + "loss": 0.4871, + "step": 9969 + }, + { + "epoch": 0.77, + "grad_norm": 1.1452919056538333, + "learning_rate": 2.572980808501919e-06, + "loss": 0.4802, + "step": 9970 + }, + { + "epoch": 0.77, + "grad_norm": 1.2000431867616592, + "learning_rate": 2.571298502693954e-06, + "loss": 0.5243, + "step": 9971 + }, + { + "epoch": 0.77, + "grad_norm": 1.081226366258881, + "learning_rate": 2.5696166658982413e-06, + "loss": 0.4674, + "step": 9972 + }, + { + "epoch": 0.77, + "grad_norm": 1.180248253538023, + "learning_rate": 2.5679352982209637e-06, + "loss": 0.5176, + "step": 9973 + }, + { + "epoch": 0.77, + "grad_norm": 1.260633822983754, + "learning_rate": 2.5662543997682756e-06, + "loss": 0.5136, + "step": 9974 + }, + { + "epoch": 0.77, + "grad_norm": 1.171092624167524, + "learning_rate": 2.5645739706463037e-06, + "loss": 0.4875, + "step": 9975 + }, + { + "epoch": 0.77, + "grad_norm": 1.1673263262976492, + "learning_rate": 2.5628940109611356e-06, + "loss": 0.4814, + "step": 9976 + }, + { + "epoch": 0.77, + "grad_norm": 1.2444236349644187, + "learning_rate": 2.5612145208188376e-06, + "loss": 0.573, + "step": 9977 + }, + { + "epoch": 0.77, + "grad_norm": 1.3020340210795742, + "learning_rate": 2.5595355003254473e-06, + "loss": 0.5376, + "step": 9978 + }, + { + "epoch": 0.77, + "grad_norm": 1.260800951970293, + "learning_rate": 2.557856949586972e-06, + "loss": 0.537, + "step": 9979 + }, + { + "epoch": 0.77, + "grad_norm": 1.2257463692150925, + "learning_rate": 2.5561788687093835e-06, + "loss": 0.4947, + "step": 9980 + }, + { + "epoch": 0.77, + "grad_norm": 1.2394408082635595, + "learning_rate": 2.554501257798624e-06, + "loss": 0.4917, + "step": 9981 + }, + { + "epoch": 0.77, + "grad_norm": 1.0865589257423698, + "learning_rate": 2.5528241169606147e-06, + "loss": 0.4663, + "step": 9982 + }, + { + "epoch": 0.77, + "grad_norm": 1.1127785601819211, + "learning_rate": 2.551147446301242e-06, + "loss": 0.4692, + "step": 9983 + }, + { + "epoch": 0.77, + "grad_norm": 1.0347785692568185, + "learning_rate": 2.5494712459263615e-06, + "loss": 0.4479, + "step": 9984 + }, + { + "epoch": 0.77, + "grad_norm": 1.279606579281968, + "learning_rate": 2.547795515941803e-06, + "loss": 0.5007, + "step": 9985 + }, + { + "epoch": 0.77, + "grad_norm": 1.2124789855551736, + "learning_rate": 2.5461202564533603e-06, + "loss": 0.5167, + "step": 9986 + }, + { + "epoch": 0.77, + "grad_norm": 1.2867602046919882, + "learning_rate": 2.544445467566802e-06, + "loss": 0.5212, + "step": 9987 + }, + { + "epoch": 0.77, + "grad_norm": 1.150153040032195, + "learning_rate": 2.5427711493878673e-06, + "loss": 0.5471, + "step": 9988 + }, + { + "epoch": 0.77, + "grad_norm": 1.2188503517504345, + "learning_rate": 2.5410973020222662e-06, + "loss": 0.5214, + "step": 9989 + }, + { + "epoch": 0.78, + "grad_norm": 1.1628562258398445, + "learning_rate": 2.539423925575676e-06, + "loss": 0.5012, + "step": 9990 + }, + { + "epoch": 0.78, + "grad_norm": 1.1414263275072591, + "learning_rate": 2.5377510201537427e-06, + "loss": 0.4883, + "step": 9991 + }, + { + "epoch": 0.78, + "grad_norm": 1.2317932750547442, + "learning_rate": 2.5360785858620863e-06, + "loss": 0.5331, + "step": 9992 + }, + { + "epoch": 0.78, + "grad_norm": 1.2076154856613535, + "learning_rate": 2.534406622806298e-06, + "loss": 0.504, + "step": 9993 + }, + { + "epoch": 0.78, + "grad_norm": 1.160594459259228, + "learning_rate": 2.532735131091937e-06, + "loss": 0.5296, + "step": 9994 + }, + { + "epoch": 0.78, + "grad_norm": 1.2041926415269755, + "learning_rate": 2.531064110824536e-06, + "loss": 0.5302, + "step": 9995 + }, + { + "epoch": 0.78, + "grad_norm": 1.1928340173768832, + "learning_rate": 2.52939356210959e-06, + "loss": 0.5035, + "step": 9996 + }, + { + "epoch": 0.78, + "grad_norm": 1.2604478511683186, + "learning_rate": 2.527723485052571e-06, + "loss": 0.5003, + "step": 9997 + }, + { + "epoch": 0.78, + "grad_norm": 1.1923388223867433, + "learning_rate": 2.526053879758924e-06, + "loss": 0.4868, + "step": 9998 + }, + { + "epoch": 0.78, + "grad_norm": 1.1555894562952946, + "learning_rate": 2.524384746334052e-06, + "loss": 0.5027, + "step": 9999 + }, + { + "epoch": 0.78, + "grad_norm": 1.1767387166986665, + "learning_rate": 2.5227160848833443e-06, + "loss": 0.491, + "step": 10000 + }, + { + "epoch": 0.78, + "grad_norm": 1.2164645162064749, + "learning_rate": 2.5210478955121444e-06, + "loss": 0.533, + "step": 10001 + }, + { + "epoch": 0.78, + "grad_norm": 1.2196183168072343, + "learning_rate": 2.5193801783257763e-06, + "loss": 0.5334, + "step": 10002 + }, + { + "epoch": 0.78, + "grad_norm": 1.1693086625106097, + "learning_rate": 2.5177129334295336e-06, + "loss": 0.4849, + "step": 10003 + }, + { + "epoch": 0.78, + "grad_norm": 1.2579749813927619, + "learning_rate": 2.5160461609286766e-06, + "loss": 0.4923, + "step": 10004 + }, + { + "epoch": 0.78, + "grad_norm": 1.3370630530965744, + "learning_rate": 2.514379860928441e-06, + "loss": 0.552, + "step": 10005 + }, + { + "epoch": 0.78, + "grad_norm": 1.1293249178887756, + "learning_rate": 2.5127140335340217e-06, + "loss": 0.4526, + "step": 10006 + }, + { + "epoch": 0.78, + "grad_norm": 1.1095443918631438, + "learning_rate": 2.511048678850595e-06, + "loss": 0.4645, + "step": 10007 + }, + { + "epoch": 0.78, + "grad_norm": 1.2825226859452796, + "learning_rate": 2.5093837969833067e-06, + "loss": 0.5017, + "step": 10008 + }, + { + "epoch": 0.78, + "grad_norm": 1.167332759943817, + "learning_rate": 2.507719388037262e-06, + "loss": 0.4616, + "step": 10009 + }, + { + "epoch": 0.78, + "grad_norm": 1.2167484769794, + "learning_rate": 2.5060554521175506e-06, + "loss": 0.4895, + "step": 10010 + }, + { + "epoch": 0.78, + "grad_norm": 1.0355720692508503, + "learning_rate": 2.504391989329219e-06, + "loss": 0.4541, + "step": 10011 + }, + { + "epoch": 0.78, + "grad_norm": 1.0623148588551712, + "learning_rate": 2.5027289997772942e-06, + "loss": 0.4406, + "step": 10012 + }, + { + "epoch": 0.78, + "grad_norm": 1.178867838482162, + "learning_rate": 2.5010664835667677e-06, + "loss": 0.4825, + "step": 10013 + }, + { + "epoch": 0.78, + "grad_norm": 1.1962115827109674, + "learning_rate": 2.499404440802604e-06, + "loss": 0.4951, + "step": 10014 + }, + { + "epoch": 0.78, + "grad_norm": 1.1557178303645517, + "learning_rate": 2.4977428715897357e-06, + "loss": 0.4658, + "step": 10015 + }, + { + "epoch": 0.78, + "grad_norm": 1.1800318840536121, + "learning_rate": 2.496081776033069e-06, + "loss": 0.5266, + "step": 10016 + }, + { + "epoch": 0.78, + "grad_norm": 1.1925639047034136, + "learning_rate": 2.494421154237473e-06, + "loss": 0.4502, + "step": 10017 + }, + { + "epoch": 0.78, + "grad_norm": 1.181635126778918, + "learning_rate": 2.4927610063077956e-06, + "loss": 0.5177, + "step": 10018 + }, + { + "epoch": 0.78, + "grad_norm": 1.2086557363779336, + "learning_rate": 2.4911013323488454e-06, + "loss": 0.5447, + "step": 10019 + }, + { + "epoch": 0.78, + "grad_norm": 1.148304367133798, + "learning_rate": 2.4894421324654084e-06, + "loss": 0.4846, + "step": 10020 + }, + { + "epoch": 0.78, + "grad_norm": 1.113414983915189, + "learning_rate": 2.487783406762242e-06, + "loss": 0.4754, + "step": 10021 + }, + { + "epoch": 0.78, + "grad_norm": 1.2629171532093313, + "learning_rate": 2.4861251553440645e-06, + "loss": 0.6041, + "step": 10022 + }, + { + "epoch": 0.78, + "grad_norm": 1.0953053451064958, + "learning_rate": 2.4844673783155716e-06, + "loss": 0.4727, + "step": 10023 + }, + { + "epoch": 0.78, + "grad_norm": 1.1419672971609276, + "learning_rate": 2.482810075781429e-06, + "loss": 0.4655, + "step": 10024 + }, + { + "epoch": 0.78, + "grad_norm": 1.307920231364143, + "learning_rate": 2.4811532478462697e-06, + "loss": 0.5604, + "step": 10025 + }, + { + "epoch": 0.78, + "grad_norm": 1.2963349930093868, + "learning_rate": 2.4794968946147012e-06, + "loss": 0.5272, + "step": 10026 + }, + { + "epoch": 0.78, + "grad_norm": 1.198603966402989, + "learning_rate": 2.4778410161912913e-06, + "loss": 0.507, + "step": 10027 + }, + { + "epoch": 0.78, + "grad_norm": 1.225843765677728, + "learning_rate": 2.4761856126805906e-06, + "loss": 0.5515, + "step": 10028 + }, + { + "epoch": 0.78, + "grad_norm": 1.119766619498295, + "learning_rate": 2.4745306841871063e-06, + "loss": 0.4824, + "step": 10029 + }, + { + "epoch": 0.78, + "grad_norm": 1.188048487030218, + "learning_rate": 2.4728762308153264e-06, + "loss": 0.522, + "step": 10030 + }, + { + "epoch": 0.78, + "grad_norm": 1.187309500823671, + "learning_rate": 2.4712222526697083e-06, + "loss": 0.5183, + "step": 10031 + }, + { + "epoch": 0.78, + "grad_norm": 1.186278216606256, + "learning_rate": 2.4695687498546694e-06, + "loss": 0.5115, + "step": 10032 + }, + { + "epoch": 0.78, + "grad_norm": 1.214572201520226, + "learning_rate": 2.4679157224746076e-06, + "loss": 0.4824, + "step": 10033 + }, + { + "epoch": 0.78, + "grad_norm": 1.1838195383982342, + "learning_rate": 2.4662631706338856e-06, + "loss": 0.5137, + "step": 10034 + }, + { + "epoch": 0.78, + "grad_norm": 1.1206735494705995, + "learning_rate": 2.4646110944368393e-06, + "loss": 0.5, + "step": 10035 + }, + { + "epoch": 0.78, + "grad_norm": 1.1044551124724868, + "learning_rate": 2.4629594939877754e-06, + "loss": 0.4861, + "step": 10036 + }, + { + "epoch": 0.78, + "grad_norm": 1.165729276681926, + "learning_rate": 2.461308369390961e-06, + "loss": 0.4922, + "step": 10037 + }, + { + "epoch": 0.78, + "grad_norm": 1.2298576654095985, + "learning_rate": 2.4596577207506477e-06, + "loss": 0.5183, + "step": 10038 + }, + { + "epoch": 0.78, + "grad_norm": 1.2426334756121429, + "learning_rate": 2.458007548171042e-06, + "loss": 0.5322, + "step": 10039 + }, + { + "epoch": 0.78, + "grad_norm": 1.2618085043903091, + "learning_rate": 2.4563578517563314e-06, + "loss": 0.5362, + "step": 10040 + }, + { + "epoch": 0.78, + "grad_norm": 1.2909303945781236, + "learning_rate": 2.4547086316106727e-06, + "loss": 0.5335, + "step": 10041 + }, + { + "epoch": 0.78, + "grad_norm": 1.2597374728110449, + "learning_rate": 2.453059887838184e-06, + "loss": 0.4994, + "step": 10042 + }, + { + "epoch": 0.78, + "grad_norm": 1.2404129500499466, + "learning_rate": 2.451411620542962e-06, + "loss": 0.5025, + "step": 10043 + }, + { + "epoch": 0.78, + "grad_norm": 1.2238815067068816, + "learning_rate": 2.4497638298290693e-06, + "loss": 0.4968, + "step": 10044 + }, + { + "epoch": 0.78, + "grad_norm": 1.1828750593807622, + "learning_rate": 2.4481165158005395e-06, + "loss": 0.4855, + "step": 10045 + }, + { + "epoch": 0.78, + "grad_norm": 1.171897531928569, + "learning_rate": 2.4464696785613805e-06, + "loss": 0.4971, + "step": 10046 + }, + { + "epoch": 0.78, + "grad_norm": 1.1963336547014285, + "learning_rate": 2.444823318215559e-06, + "loss": 0.4717, + "step": 10047 + }, + { + "epoch": 0.78, + "grad_norm": 1.2407457634021282, + "learning_rate": 2.4431774348670236e-06, + "loss": 0.5609, + "step": 10048 + }, + { + "epoch": 0.78, + "grad_norm": 1.1881292081594819, + "learning_rate": 2.441532028619682e-06, + "loss": 0.5203, + "step": 10049 + }, + { + "epoch": 0.78, + "grad_norm": 1.2974123932914685, + "learning_rate": 2.4398870995774194e-06, + "loss": 0.5319, + "step": 10050 + }, + { + "epoch": 0.78, + "grad_norm": 1.202422642985484, + "learning_rate": 2.4382426478440925e-06, + "loss": 0.5096, + "step": 10051 + }, + { + "epoch": 0.78, + "grad_norm": 1.1573668575954223, + "learning_rate": 2.4365986735235183e-06, + "loss": 0.4958, + "step": 10052 + }, + { + "epoch": 0.78, + "grad_norm": 1.3252900777735628, + "learning_rate": 2.4349551767194914e-06, + "loss": 0.5182, + "step": 10053 + }, + { + "epoch": 0.78, + "grad_norm": 1.2774157489235207, + "learning_rate": 2.433312157535774e-06, + "loss": 0.5626, + "step": 10054 + }, + { + "epoch": 0.78, + "grad_norm": 1.1322958458639376, + "learning_rate": 2.431669616076101e-06, + "loss": 0.4618, + "step": 10055 + }, + { + "epoch": 0.78, + "grad_norm": 1.114822284836688, + "learning_rate": 2.430027552444174e-06, + "loss": 0.4799, + "step": 10056 + }, + { + "epoch": 0.78, + "grad_norm": 1.2591295159239135, + "learning_rate": 2.4283859667436615e-06, + "loss": 0.5261, + "step": 10057 + }, + { + "epoch": 0.78, + "grad_norm": 1.212643608948486, + "learning_rate": 2.4267448590782093e-06, + "loss": 0.5242, + "step": 10058 + }, + { + "epoch": 0.78, + "grad_norm": 1.1440619329947614, + "learning_rate": 2.425104229551425e-06, + "loss": 0.5239, + "step": 10059 + }, + { + "epoch": 0.78, + "grad_norm": 1.171990147337657, + "learning_rate": 2.4234640782668917e-06, + "loss": 0.5115, + "step": 10060 + }, + { + "epoch": 0.78, + "grad_norm": 1.2037450010328585, + "learning_rate": 2.4218244053281636e-06, + "loss": 0.5091, + "step": 10061 + }, + { + "epoch": 0.78, + "grad_norm": 1.1519383121338802, + "learning_rate": 2.420185210838756e-06, + "loss": 0.4638, + "step": 10062 + }, + { + "epoch": 0.78, + "grad_norm": 1.1667125159291576, + "learning_rate": 2.418546494902163e-06, + "loss": 0.4872, + "step": 10063 + }, + { + "epoch": 0.78, + "grad_norm": 1.3481308253502409, + "learning_rate": 2.416908257621845e-06, + "loss": 0.5743, + "step": 10064 + }, + { + "epoch": 0.78, + "grad_norm": 1.1603950229089637, + "learning_rate": 2.415270499101232e-06, + "loss": 0.521, + "step": 10065 + }, + { + "epoch": 0.78, + "grad_norm": 1.2866470826508272, + "learning_rate": 2.413633219443725e-06, + "loss": 0.5425, + "step": 10066 + }, + { + "epoch": 0.78, + "grad_norm": 1.1823853218974247, + "learning_rate": 2.411996418752696e-06, + "loss": 0.5001, + "step": 10067 + }, + { + "epoch": 0.78, + "grad_norm": 1.1507584869350262, + "learning_rate": 2.410360097131482e-06, + "loss": 0.4919, + "step": 10068 + }, + { + "epoch": 0.78, + "grad_norm": 1.1746519628427923, + "learning_rate": 2.4087242546833887e-06, + "loss": 0.5151, + "step": 10069 + }, + { + "epoch": 0.78, + "grad_norm": 1.1421706818100408, + "learning_rate": 2.407088891511701e-06, + "loss": 0.4504, + "step": 10070 + }, + { + "epoch": 0.78, + "grad_norm": 1.199424750810467, + "learning_rate": 2.4054540077196644e-06, + "loss": 0.4759, + "step": 10071 + }, + { + "epoch": 0.78, + "grad_norm": 1.1506963818987264, + "learning_rate": 2.403819603410502e-06, + "loss": 0.4707, + "step": 10072 + }, + { + "epoch": 0.78, + "grad_norm": 1.2772116869863952, + "learning_rate": 2.4021856786873964e-06, + "loss": 0.5053, + "step": 10073 + }, + { + "epoch": 0.78, + "grad_norm": 1.2042466421969442, + "learning_rate": 2.400552233653508e-06, + "loss": 0.4835, + "step": 10074 + }, + { + "epoch": 0.78, + "grad_norm": 1.2620288007324452, + "learning_rate": 2.398919268411967e-06, + "loss": 0.53, + "step": 10075 + }, + { + "epoch": 0.78, + "grad_norm": 1.144201758546483, + "learning_rate": 2.3972867830658665e-06, + "loss": 0.4688, + "step": 10076 + }, + { + "epoch": 0.78, + "grad_norm": 1.0487533912103724, + "learning_rate": 2.3956547777182805e-06, + "loss": 0.4783, + "step": 10077 + }, + { + "epoch": 0.78, + "grad_norm": 1.2807614860578087, + "learning_rate": 2.394023252472242e-06, + "loss": 0.5428, + "step": 10078 + }, + { + "epoch": 0.78, + "grad_norm": 1.1751886439566128, + "learning_rate": 2.392392207430754e-06, + "loss": 0.4952, + "step": 10079 + }, + { + "epoch": 0.78, + "grad_norm": 1.2347904423672806, + "learning_rate": 2.390761642696795e-06, + "loss": 0.5202, + "step": 10080 + }, + { + "epoch": 0.78, + "grad_norm": 1.1340833704171946, + "learning_rate": 2.3891315583733133e-06, + "loss": 0.5496, + "step": 10081 + }, + { + "epoch": 0.78, + "grad_norm": 1.347717151095056, + "learning_rate": 2.387501954563225e-06, + "loss": 0.5304, + "step": 10082 + }, + { + "epoch": 0.78, + "grad_norm": 1.235878957719505, + "learning_rate": 2.385872831369411e-06, + "loss": 0.4912, + "step": 10083 + }, + { + "epoch": 0.78, + "grad_norm": 1.2240811658326685, + "learning_rate": 2.38424418889473e-06, + "loss": 0.4682, + "step": 10084 + }, + { + "epoch": 0.78, + "grad_norm": 1.2674457968835273, + "learning_rate": 2.382616027242005e-06, + "loss": 0.5136, + "step": 10085 + }, + { + "epoch": 0.78, + "grad_norm": 1.2216633430442685, + "learning_rate": 2.3809883465140304e-06, + "loss": 0.461, + "step": 10086 + }, + { + "epoch": 0.78, + "grad_norm": 1.2392178913056242, + "learning_rate": 2.3793611468135734e-06, + "loss": 0.5249, + "step": 10087 + }, + { + "epoch": 0.78, + "grad_norm": 1.1419433395405967, + "learning_rate": 2.3777344282433645e-06, + "loss": 0.4885, + "step": 10088 + }, + { + "epoch": 0.78, + "grad_norm": 1.1360705123327532, + "learning_rate": 2.3761081909061036e-06, + "loss": 0.4726, + "step": 10089 + }, + { + "epoch": 0.78, + "grad_norm": 1.2173158080965867, + "learning_rate": 2.374482434904467e-06, + "loss": 0.5024, + "step": 10090 + }, + { + "epoch": 0.78, + "grad_norm": 1.2974416115472984, + "learning_rate": 2.372857160341098e-06, + "loss": 0.4977, + "step": 10091 + }, + { + "epoch": 0.78, + "grad_norm": 1.209564217838977, + "learning_rate": 2.371232367318609e-06, + "loss": 0.5027, + "step": 10092 + }, + { + "epoch": 0.78, + "grad_norm": 1.1404234303078846, + "learning_rate": 2.369608055939576e-06, + "loss": 0.464, + "step": 10093 + }, + { + "epoch": 0.78, + "grad_norm": 1.2176849895988486, + "learning_rate": 2.3679842263065554e-06, + "loss": 0.5141, + "step": 10094 + }, + { + "epoch": 0.78, + "grad_norm": 1.1304284099770816, + "learning_rate": 2.366360878522067e-06, + "loss": 0.5312, + "step": 10095 + }, + { + "epoch": 0.78, + "grad_norm": 1.2151739524208165, + "learning_rate": 2.3647380126886033e-06, + "loss": 0.4729, + "step": 10096 + }, + { + "epoch": 0.78, + "grad_norm": 1.0972838397906044, + "learning_rate": 2.363115628908619e-06, + "loss": 0.4701, + "step": 10097 + }, + { + "epoch": 0.78, + "grad_norm": 1.2570521739975058, + "learning_rate": 2.3614937272845484e-06, + "loss": 0.5217, + "step": 10098 + }, + { + "epoch": 0.78, + "grad_norm": 1.1563948463235374, + "learning_rate": 2.3598723079187848e-06, + "loss": 0.4791, + "step": 10099 + }, + { + "epoch": 0.78, + "grad_norm": 1.211298685660256, + "learning_rate": 2.358251370913701e-06, + "loss": 0.4732, + "step": 10100 + }, + { + "epoch": 0.78, + "grad_norm": 1.0808573941680326, + "learning_rate": 2.356630916371635e-06, + "loss": 0.5037, + "step": 10101 + }, + { + "epoch": 0.78, + "grad_norm": 1.2364363540114214, + "learning_rate": 2.3550109443948967e-06, + "loss": 0.501, + "step": 10102 + }, + { + "epoch": 0.78, + "grad_norm": 1.0159774755511717, + "learning_rate": 2.353391455085756e-06, + "loss": 0.4007, + "step": 10103 + }, + { + "epoch": 0.78, + "grad_norm": 1.1697519016499072, + "learning_rate": 2.3517724485464655e-06, + "loss": 0.4867, + "step": 10104 + }, + { + "epoch": 0.78, + "grad_norm": 1.248692639458742, + "learning_rate": 2.3501539248792406e-06, + "loss": 0.502, + "step": 10105 + }, + { + "epoch": 0.78, + "grad_norm": 1.1643580119213923, + "learning_rate": 2.348535884186267e-06, + "loss": 0.4728, + "step": 10106 + }, + { + "epoch": 0.78, + "grad_norm": 1.188584585303675, + "learning_rate": 2.3469183265696984e-06, + "loss": 0.4422, + "step": 10107 + }, + { + "epoch": 0.78, + "grad_norm": 1.2102148545458833, + "learning_rate": 2.3453012521316633e-06, + "loss": 0.4905, + "step": 10108 + }, + { + "epoch": 0.78, + "grad_norm": 1.209539184465441, + "learning_rate": 2.343684660974249e-06, + "loss": 0.5305, + "step": 10109 + }, + { + "epoch": 0.78, + "grad_norm": 1.075547828366324, + "learning_rate": 2.3420685531995247e-06, + "loss": 0.4604, + "step": 10110 + }, + { + "epoch": 0.78, + "grad_norm": 1.1997112045867584, + "learning_rate": 2.340452928909522e-06, + "loss": 0.5389, + "step": 10111 + }, + { + "epoch": 0.78, + "grad_norm": 1.2543143205549179, + "learning_rate": 2.3388377882062472e-06, + "loss": 0.5065, + "step": 10112 + }, + { + "epoch": 0.78, + "grad_norm": 1.1929581337526975, + "learning_rate": 2.337223131191666e-06, + "loss": 0.4766, + "step": 10113 + }, + { + "epoch": 0.78, + "grad_norm": 1.1575132634533807, + "learning_rate": 2.335608957967723e-06, + "loss": 0.4926, + "step": 10114 + }, + { + "epoch": 0.78, + "grad_norm": 1.3458825422252627, + "learning_rate": 2.33399526863633e-06, + "loss": 0.5577, + "step": 10115 + }, + { + "epoch": 0.78, + "grad_norm": 1.1670118513804872, + "learning_rate": 2.33238206329937e-06, + "loss": 0.4899, + "step": 10116 + }, + { + "epoch": 0.78, + "grad_norm": 1.1514919947827544, + "learning_rate": 2.3307693420586873e-06, + "loss": 0.4623, + "step": 10117 + }, + { + "epoch": 0.78, + "grad_norm": 1.1456185341843501, + "learning_rate": 2.3291571050161066e-06, + "loss": 0.4777, + "step": 10118 + }, + { + "epoch": 0.79, + "grad_norm": 1.1668270999862007, + "learning_rate": 2.3275453522734116e-06, + "loss": 0.5206, + "step": 10119 + }, + { + "epoch": 0.79, + "grad_norm": 1.2782400643708445, + "learning_rate": 2.325934083932362e-06, + "loss": 0.5409, + "step": 10120 + }, + { + "epoch": 0.79, + "grad_norm": 1.11572265625, + "learning_rate": 2.3243233000946874e-06, + "loss": 0.4567, + "step": 10121 + }, + { + "epoch": 0.79, + "grad_norm": 1.1744299946688732, + "learning_rate": 2.3227130008620847e-06, + "loss": 0.4407, + "step": 10122 + }, + { + "epoch": 0.79, + "grad_norm": 1.2944843824762486, + "learning_rate": 2.321103186336222e-06, + "loss": 0.5814, + "step": 10123 + }, + { + "epoch": 0.79, + "grad_norm": 1.3216213984071001, + "learning_rate": 2.319493856618731e-06, + "loss": 0.5074, + "step": 10124 + }, + { + "epoch": 0.79, + "grad_norm": 1.1083881663380104, + "learning_rate": 2.3178850118112185e-06, + "loss": 0.4991, + "step": 10125 + }, + { + "epoch": 0.79, + "grad_norm": 1.3344712269590773, + "learning_rate": 2.3162766520152624e-06, + "loss": 0.5435, + "step": 10126 + }, + { + "epoch": 0.79, + "grad_norm": 1.136803482935145, + "learning_rate": 2.3146687773324017e-06, + "loss": 0.4609, + "step": 10127 + }, + { + "epoch": 0.79, + "grad_norm": 1.1339425074318483, + "learning_rate": 2.313061387864155e-06, + "loss": 0.4937, + "step": 10128 + }, + { + "epoch": 0.79, + "grad_norm": 1.2276166528717671, + "learning_rate": 2.311454483711999e-06, + "loss": 0.466, + "step": 10129 + }, + { + "epoch": 0.79, + "grad_norm": 1.2216988614633686, + "learning_rate": 2.309848064977389e-06, + "loss": 0.5624, + "step": 10130 + }, + { + "epoch": 0.79, + "grad_norm": 1.1410175327388248, + "learning_rate": 2.3082421317617463e-06, + "loss": 0.5206, + "step": 10131 + }, + { + "epoch": 0.79, + "grad_norm": 1.1642994991442626, + "learning_rate": 2.3066366841664633e-06, + "loss": 0.5073, + "step": 10132 + }, + { + "epoch": 0.79, + "grad_norm": 1.2936334658858177, + "learning_rate": 2.3050317222929006e-06, + "loss": 0.4881, + "step": 10133 + }, + { + "epoch": 0.79, + "grad_norm": 1.218495611393429, + "learning_rate": 2.3034272462423846e-06, + "loss": 0.4935, + "step": 10134 + }, + { + "epoch": 0.79, + "grad_norm": 1.191787958752573, + "learning_rate": 2.3018232561162144e-06, + "loss": 0.5166, + "step": 10135 + }, + { + "epoch": 0.79, + "grad_norm": 1.2516170532681377, + "learning_rate": 2.3002197520156634e-06, + "loss": 0.5487, + "step": 10136 + }, + { + "epoch": 0.79, + "grad_norm": 1.0504019240469935, + "learning_rate": 2.2986167340419606e-06, + "loss": 0.472, + "step": 10137 + }, + { + "epoch": 0.79, + "grad_norm": 1.1487923936627922, + "learning_rate": 2.2970142022963216e-06, + "loss": 0.4904, + "step": 10138 + }, + { + "epoch": 0.79, + "grad_norm": 1.2419663718273535, + "learning_rate": 2.295412156879915e-06, + "loss": 0.5323, + "step": 10139 + }, + { + "epoch": 0.79, + "grad_norm": 1.2529952403021525, + "learning_rate": 2.2938105978938897e-06, + "loss": 0.5181, + "step": 10140 + }, + { + "epoch": 0.79, + "grad_norm": 1.1726901461831587, + "learning_rate": 2.2922095254393594e-06, + "loss": 0.4811, + "step": 10141 + }, + { + "epoch": 0.79, + "grad_norm": 1.2230176787545148, + "learning_rate": 2.290608939617408e-06, + "loss": 0.5283, + "step": 10142 + }, + { + "epoch": 0.79, + "grad_norm": 1.2528957204756141, + "learning_rate": 2.2890088405290935e-06, + "loss": 0.4803, + "step": 10143 + }, + { + "epoch": 0.79, + "grad_norm": 1.1980383216893382, + "learning_rate": 2.2874092282754313e-06, + "loss": 0.5198, + "step": 10144 + }, + { + "epoch": 0.79, + "grad_norm": 1.1562906722055037, + "learning_rate": 2.2858101029574164e-06, + "loss": 0.4644, + "step": 10145 + }, + { + "epoch": 0.79, + "grad_norm": 1.1054367761739514, + "learning_rate": 2.284211464676013e-06, + "loss": 0.4602, + "step": 10146 + }, + { + "epoch": 0.79, + "grad_norm": 1.1415859511770745, + "learning_rate": 2.2826133135321437e-06, + "loss": 0.4697, + "step": 10147 + }, + { + "epoch": 0.79, + "grad_norm": 1.2473425750937144, + "learning_rate": 2.2810156496267165e-06, + "loss": 0.5214, + "step": 10148 + }, + { + "epoch": 0.79, + "grad_norm": 1.2849709977316985, + "learning_rate": 2.2794184730605926e-06, + "loss": 0.5002, + "step": 10149 + }, + { + "epoch": 0.79, + "grad_norm": 1.230826867684282, + "learning_rate": 2.277821783934614e-06, + "loss": 0.5676, + "step": 10150 + }, + { + "epoch": 0.79, + "grad_norm": 1.1926225300983107, + "learning_rate": 2.276225582349587e-06, + "loss": 0.4976, + "step": 10151 + }, + { + "epoch": 0.79, + "grad_norm": 1.2403045874718333, + "learning_rate": 2.2746298684062884e-06, + "loss": 0.5158, + "step": 10152 + }, + { + "epoch": 0.79, + "grad_norm": 1.2616096662010758, + "learning_rate": 2.273034642205467e-06, + "loss": 0.5316, + "step": 10153 + }, + { + "epoch": 0.79, + "grad_norm": 1.1504857488658893, + "learning_rate": 2.2714399038478317e-06, + "loss": 0.4866, + "step": 10154 + }, + { + "epoch": 0.79, + "grad_norm": 1.1106666245439514, + "learning_rate": 2.2698456534340694e-06, + "loss": 0.4931, + "step": 10155 + }, + { + "epoch": 0.79, + "grad_norm": 1.2648908110375974, + "learning_rate": 2.2682518910648353e-06, + "loss": 0.5406, + "step": 10156 + }, + { + "epoch": 0.79, + "grad_norm": 1.0745793517621873, + "learning_rate": 2.266658616840748e-06, + "loss": 0.4133, + "step": 10157 + }, + { + "epoch": 0.79, + "grad_norm": 1.27854405740696, + "learning_rate": 2.265065830862404e-06, + "loss": 0.5401, + "step": 10158 + }, + { + "epoch": 0.79, + "grad_norm": 1.3219401692948791, + "learning_rate": 2.2634735332303583e-06, + "loss": 0.5505, + "step": 10159 + }, + { + "epoch": 0.79, + "grad_norm": 1.2212095141069266, + "learning_rate": 2.261881724045143e-06, + "loss": 0.4825, + "step": 10160 + }, + { + "epoch": 0.79, + "grad_norm": 1.091805118536494, + "learning_rate": 2.2602904034072583e-06, + "loss": 0.4823, + "step": 10161 + }, + { + "epoch": 0.79, + "grad_norm": 1.2066577716274733, + "learning_rate": 2.2586995714171712e-06, + "loss": 0.5196, + "step": 10162 + }, + { + "epoch": 0.79, + "grad_norm": 1.2136639327423837, + "learning_rate": 2.257109228175324e-06, + "loss": 0.5029, + "step": 10163 + }, + { + "epoch": 0.79, + "grad_norm": 1.1878321333750181, + "learning_rate": 2.2555193737821156e-06, + "loss": 0.5324, + "step": 10164 + }, + { + "epoch": 0.79, + "grad_norm": 1.3371091021872668, + "learning_rate": 2.253930008337927e-06, + "loss": 0.5146, + "step": 10165 + }, + { + "epoch": 0.79, + "grad_norm": 1.2784012549574402, + "learning_rate": 2.252341131943102e-06, + "loss": 0.5207, + "step": 10166 + }, + { + "epoch": 0.79, + "grad_norm": 1.1155664914814352, + "learning_rate": 2.250752744697953e-06, + "loss": 0.541, + "step": 10167 + }, + { + "epoch": 0.79, + "grad_norm": 1.193134892354234, + "learning_rate": 2.249164846702766e-06, + "loss": 0.4344, + "step": 10168 + }, + { + "epoch": 0.79, + "grad_norm": 1.227594852317764, + "learning_rate": 2.247577438057789e-06, + "loss": 0.5493, + "step": 10169 + }, + { + "epoch": 0.79, + "grad_norm": 1.2751448249842616, + "learning_rate": 2.2459905188632446e-06, + "loss": 0.5256, + "step": 10170 + }, + { + "epoch": 0.79, + "grad_norm": 1.2167122261888477, + "learning_rate": 2.244404089219325e-06, + "loss": 0.5444, + "step": 10171 + }, + { + "epoch": 0.79, + "grad_norm": 1.2273949392074224, + "learning_rate": 2.242818149226189e-06, + "loss": 0.5473, + "step": 10172 + }, + { + "epoch": 0.79, + "grad_norm": 1.2131404895766427, + "learning_rate": 2.2412326989839673e-06, + "loss": 0.489, + "step": 10173 + }, + { + "epoch": 0.79, + "grad_norm": 1.165382046986016, + "learning_rate": 2.2396477385927527e-06, + "loss": 0.4718, + "step": 10174 + }, + { + "epoch": 0.79, + "grad_norm": 1.4393539083958824, + "learning_rate": 2.238063268152615e-06, + "loss": 0.5621, + "step": 10175 + }, + { + "epoch": 0.79, + "grad_norm": 1.1952756489577088, + "learning_rate": 2.236479287763591e-06, + "loss": 0.5411, + "step": 10176 + }, + { + "epoch": 0.79, + "grad_norm": 1.2702662761015613, + "learning_rate": 2.2348957975256825e-06, + "loss": 0.52, + "step": 10177 + }, + { + "epoch": 0.79, + "grad_norm": 1.2513012788461038, + "learning_rate": 2.2333127975388646e-06, + "loss": 0.5292, + "step": 10178 + }, + { + "epoch": 0.79, + "grad_norm": 1.2521675866459296, + "learning_rate": 2.2317302879030833e-06, + "loss": 0.5621, + "step": 10179 + }, + { + "epoch": 0.79, + "grad_norm": 1.13786158840586, + "learning_rate": 2.230148268718245e-06, + "loss": 0.4824, + "step": 10180 + }, + { + "epoch": 0.79, + "grad_norm": 1.173078046620181, + "learning_rate": 2.228566740084234e-06, + "loss": 0.4786, + "step": 10181 + }, + { + "epoch": 0.79, + "grad_norm": 1.1558363406511902, + "learning_rate": 2.2269857021009e-06, + "loss": 0.5146, + "step": 10182 + }, + { + "epoch": 0.79, + "grad_norm": 1.1292032029456813, + "learning_rate": 2.225405154868062e-06, + "loss": 0.5, + "step": 10183 + }, + { + "epoch": 0.79, + "grad_norm": 1.1289229342273814, + "learning_rate": 2.2238250984855112e-06, + "loss": 0.4808, + "step": 10184 + }, + { + "epoch": 0.79, + "grad_norm": 1.2538177839308264, + "learning_rate": 2.2222455330529993e-06, + "loss": 0.5224, + "step": 10185 + }, + { + "epoch": 0.79, + "grad_norm": 1.3138769737753615, + "learning_rate": 2.2206664586702566e-06, + "loss": 0.5375, + "step": 10186 + }, + { + "epoch": 0.79, + "grad_norm": 1.1931142602100457, + "learning_rate": 2.219087875436974e-06, + "loss": 0.4765, + "step": 10187 + }, + { + "epoch": 0.79, + "grad_norm": 1.145986211577279, + "learning_rate": 2.2175097834528183e-06, + "loss": 0.4783, + "step": 10188 + }, + { + "epoch": 0.79, + "grad_norm": 1.100529883378034, + "learning_rate": 2.2159321828174252e-06, + "loss": 0.4952, + "step": 10189 + }, + { + "epoch": 0.79, + "grad_norm": 1.1984984878479574, + "learning_rate": 2.214355073630391e-06, + "loss": 0.464, + "step": 10190 + }, + { + "epoch": 0.79, + "grad_norm": 1.195610408185424, + "learning_rate": 2.212778455991289e-06, + "loss": 0.4916, + "step": 10191 + }, + { + "epoch": 0.79, + "grad_norm": 1.2169761464422881, + "learning_rate": 2.211202329999661e-06, + "loss": 0.4983, + "step": 10192 + }, + { + "epoch": 0.79, + "grad_norm": 1.2559834800137961, + "learning_rate": 2.2096266957550138e-06, + "loss": 0.5574, + "step": 10193 + }, + { + "epoch": 0.79, + "grad_norm": 1.1271198750963534, + "learning_rate": 2.208051553356829e-06, + "loss": 0.4698, + "step": 10194 + }, + { + "epoch": 0.79, + "grad_norm": 1.1662122091076532, + "learning_rate": 2.2064769029045497e-06, + "loss": 0.497, + "step": 10195 + }, + { + "epoch": 0.79, + "grad_norm": 1.2296533698613163, + "learning_rate": 2.2049027444975934e-06, + "loss": 0.5144, + "step": 10196 + }, + { + "epoch": 0.79, + "grad_norm": 1.2364310030446166, + "learning_rate": 2.2033290782353434e-06, + "loss": 0.5392, + "step": 10197 + }, + { + "epoch": 0.79, + "grad_norm": 1.1779016277935124, + "learning_rate": 2.2017559042171534e-06, + "loss": 0.5042, + "step": 10198 + }, + { + "epoch": 0.79, + "grad_norm": 1.2021413968683448, + "learning_rate": 2.2001832225423493e-06, + "loss": 0.5654, + "step": 10199 + }, + { + "epoch": 0.79, + "grad_norm": 1.1834056440386747, + "learning_rate": 2.1986110333102175e-06, + "loss": 0.4727, + "step": 10200 + }, + { + "epoch": 0.79, + "grad_norm": 1.331240198161974, + "learning_rate": 2.1970393366200216e-06, + "loss": 0.479, + "step": 10201 + }, + { + "epoch": 0.79, + "grad_norm": 1.1756101971778818, + "learning_rate": 2.195468132570989e-06, + "loss": 0.4601, + "step": 10202 + }, + { + "epoch": 0.79, + "grad_norm": 1.0827746540187844, + "learning_rate": 2.193897421262321e-06, + "loss": 0.4897, + "step": 10203 + }, + { + "epoch": 0.79, + "grad_norm": 1.1487147716645416, + "learning_rate": 2.1923272027931853e-06, + "loss": 0.4507, + "step": 10204 + }, + { + "epoch": 0.79, + "grad_norm": 1.2662310620894188, + "learning_rate": 2.1907574772627116e-06, + "loss": 0.531, + "step": 10205 + }, + { + "epoch": 0.79, + "grad_norm": 1.2257299331027671, + "learning_rate": 2.189188244770013e-06, + "loss": 0.4869, + "step": 10206 + }, + { + "epoch": 0.79, + "grad_norm": 1.1055489770468852, + "learning_rate": 2.187619505414156e-06, + "loss": 0.5016, + "step": 10207 + }, + { + "epoch": 0.79, + "grad_norm": 1.2545303265283958, + "learning_rate": 2.186051259294185e-06, + "loss": 0.52, + "step": 10208 + }, + { + "epoch": 0.79, + "grad_norm": 1.1895532424960171, + "learning_rate": 2.1844835065091165e-06, + "loss": 0.5058, + "step": 10209 + }, + { + "epoch": 0.79, + "grad_norm": 1.190883634415636, + "learning_rate": 2.1829162471579234e-06, + "loss": 0.5163, + "step": 10210 + }, + { + "epoch": 0.79, + "grad_norm": 1.0893931123734641, + "learning_rate": 2.18134948133956e-06, + "loss": 0.4837, + "step": 10211 + }, + { + "epoch": 0.79, + "grad_norm": 1.1292518166017984, + "learning_rate": 2.1797832091529414e-06, + "loss": 0.5445, + "step": 10212 + }, + { + "epoch": 0.79, + "grad_norm": 1.1243821142712682, + "learning_rate": 2.178217430696956e-06, + "loss": 0.5161, + "step": 10213 + }, + { + "epoch": 0.79, + "grad_norm": 1.0830331838953589, + "learning_rate": 2.1766521460704627e-06, + "loss": 0.4431, + "step": 10214 + }, + { + "epoch": 0.79, + "grad_norm": 1.2119092918136107, + "learning_rate": 2.1750873553722796e-06, + "loss": 0.4775, + "step": 10215 + }, + { + "epoch": 0.79, + "grad_norm": 1.0401988305247927, + "learning_rate": 2.1735230587012057e-06, + "loss": 0.4159, + "step": 10216 + }, + { + "epoch": 0.79, + "grad_norm": 1.318567737079112, + "learning_rate": 2.1719592561559978e-06, + "loss": 0.5214, + "step": 10217 + }, + { + "epoch": 0.79, + "grad_norm": 1.0961440995847413, + "learning_rate": 2.1703959478353886e-06, + "loss": 0.4706, + "step": 10218 + }, + { + "epoch": 0.79, + "grad_norm": 1.2243958793600707, + "learning_rate": 2.168833133838082e-06, + "loss": 0.4916, + "step": 10219 + }, + { + "epoch": 0.79, + "grad_norm": 1.2505341818956475, + "learning_rate": 2.167270814262741e-06, + "loss": 0.4605, + "step": 10220 + }, + { + "epoch": 0.79, + "grad_norm": 1.3083759639055283, + "learning_rate": 2.165708989208004e-06, + "loss": 0.4989, + "step": 10221 + }, + { + "epoch": 0.79, + "grad_norm": 1.321453932846483, + "learning_rate": 2.1641476587724784e-06, + "loss": 0.5485, + "step": 10222 + }, + { + "epoch": 0.79, + "grad_norm": 1.3265343958525244, + "learning_rate": 2.16258682305474e-06, + "loss": 0.5625, + "step": 10223 + }, + { + "epoch": 0.79, + "grad_norm": 1.1665851984643831, + "learning_rate": 2.1610264821533323e-06, + "loss": 0.5387, + "step": 10224 + }, + { + "epoch": 0.79, + "grad_norm": 1.1498525296899968, + "learning_rate": 2.159466636166765e-06, + "loss": 0.508, + "step": 10225 + }, + { + "epoch": 0.79, + "grad_norm": 1.2169015511788683, + "learning_rate": 2.1579072851935222e-06, + "loss": 0.5239, + "step": 10226 + }, + { + "epoch": 0.79, + "grad_norm": 1.3006873111197723, + "learning_rate": 2.1563484293320503e-06, + "loss": 0.5655, + "step": 10227 + }, + { + "epoch": 0.79, + "grad_norm": 1.2583264077019156, + "learning_rate": 2.154790068680771e-06, + "loss": 0.4888, + "step": 10228 + }, + { + "epoch": 0.79, + "grad_norm": 1.3641088209119965, + "learning_rate": 2.1532322033380725e-06, + "loss": 0.5536, + "step": 10229 + }, + { + "epoch": 0.79, + "grad_norm": 1.2896213591623045, + "learning_rate": 2.1516748334023065e-06, + "loss": 0.5742, + "step": 10230 + }, + { + "epoch": 0.79, + "grad_norm": 1.266656996455841, + "learning_rate": 2.1501179589717993e-06, + "loss": 0.558, + "step": 10231 + }, + { + "epoch": 0.79, + "grad_norm": 1.23049209814778, + "learning_rate": 2.148561580144847e-06, + "loss": 0.5469, + "step": 10232 + }, + { + "epoch": 0.79, + "grad_norm": 1.083509461439479, + "learning_rate": 2.1470056970197085e-06, + "loss": 0.489, + "step": 10233 + }, + { + "epoch": 0.79, + "grad_norm": 1.1973030619656757, + "learning_rate": 2.14545030969462e-06, + "loss": 0.5326, + "step": 10234 + }, + { + "epoch": 0.79, + "grad_norm": 1.1525056369668465, + "learning_rate": 2.143895418267775e-06, + "loss": 0.532, + "step": 10235 + }, + { + "epoch": 0.79, + "grad_norm": 1.2896594890402306, + "learning_rate": 2.1423410228373477e-06, + "loss": 0.5312, + "step": 10236 + }, + { + "epoch": 0.79, + "grad_norm": 1.1524543838410315, + "learning_rate": 2.1407871235014675e-06, + "loss": 0.4639, + "step": 10237 + }, + { + "epoch": 0.79, + "grad_norm": 1.3704034094412325, + "learning_rate": 2.139233720358246e-06, + "loss": 0.5722, + "step": 10238 + }, + { + "epoch": 0.79, + "grad_norm": 1.2197110714932773, + "learning_rate": 2.137680813505756e-06, + "loss": 0.5238, + "step": 10239 + }, + { + "epoch": 0.79, + "grad_norm": 1.23459961211722, + "learning_rate": 2.1361284030420416e-06, + "loss": 0.5142, + "step": 10240 + }, + { + "epoch": 0.79, + "grad_norm": 1.1625394650395113, + "learning_rate": 2.1345764890651123e-06, + "loss": 0.4782, + "step": 10241 + }, + { + "epoch": 0.79, + "grad_norm": 1.232935440257451, + "learning_rate": 2.1330250716729484e-06, + "loss": 0.4937, + "step": 10242 + }, + { + "epoch": 0.79, + "grad_norm": 1.0461478483585889, + "learning_rate": 2.1314741509635007e-06, + "loss": 0.4287, + "step": 10243 + }, + { + "epoch": 0.79, + "grad_norm": 1.117240317636678, + "learning_rate": 2.1299237270346885e-06, + "loss": 0.4419, + "step": 10244 + }, + { + "epoch": 0.79, + "grad_norm": 1.0641729983681847, + "learning_rate": 2.1283737999843922e-06, + "loss": 0.4996, + "step": 10245 + }, + { + "epoch": 0.79, + "grad_norm": 1.2339668081880149, + "learning_rate": 2.126824369910474e-06, + "loss": 0.5177, + "step": 10246 + }, + { + "epoch": 0.79, + "grad_norm": 1.0542395311486665, + "learning_rate": 2.125275436910751e-06, + "loss": 0.4682, + "step": 10247 + }, + { + "epoch": 0.8, + "grad_norm": 1.286338332083872, + "learning_rate": 2.123727001083017e-06, + "loss": 0.5072, + "step": 10248 + }, + { + "epoch": 0.8, + "grad_norm": 1.1217494203631044, + "learning_rate": 2.1221790625250336e-06, + "loss": 0.477, + "step": 10249 + }, + { + "epoch": 0.8, + "grad_norm": 1.1588988506260418, + "learning_rate": 2.1206316213345334e-06, + "loss": 0.5209, + "step": 10250 + }, + { + "epoch": 0.8, + "grad_norm": 1.3158661048308031, + "learning_rate": 2.119084677609209e-06, + "loss": 0.4853, + "step": 10251 + }, + { + "epoch": 0.8, + "grad_norm": 1.2928356090579884, + "learning_rate": 2.1175382314467285e-06, + "loss": 0.4804, + "step": 10252 + }, + { + "epoch": 0.8, + "grad_norm": 1.230745653754229, + "learning_rate": 2.1159922829447267e-06, + "loss": 0.4912, + "step": 10253 + }, + { + "epoch": 0.8, + "grad_norm": 1.186337956503034, + "learning_rate": 2.1144468322008125e-06, + "loss": 0.5167, + "step": 10254 + }, + { + "epoch": 0.8, + "grad_norm": 1.1992578655412016, + "learning_rate": 2.11290187931255e-06, + "loss": 0.5538, + "step": 10255 + }, + { + "epoch": 0.8, + "grad_norm": 1.1897909752606575, + "learning_rate": 2.111357424377487e-06, + "loss": 0.5504, + "step": 10256 + }, + { + "epoch": 0.8, + "grad_norm": 1.3010672406670374, + "learning_rate": 2.109813467493127e-06, + "loss": 0.5576, + "step": 10257 + }, + { + "epoch": 0.8, + "grad_norm": 1.2899843560977982, + "learning_rate": 2.1082700087569517e-06, + "loss": 0.585, + "step": 10258 + }, + { + "epoch": 0.8, + "grad_norm": 1.2554375160951472, + "learning_rate": 2.106727048266406e-06, + "loss": 0.5399, + "step": 10259 + }, + { + "epoch": 0.8, + "grad_norm": 1.2978699326201804, + "learning_rate": 2.105184586118908e-06, + "loss": 0.5437, + "step": 10260 + }, + { + "epoch": 0.8, + "grad_norm": 1.1338721745114873, + "learning_rate": 2.1036426224118366e-06, + "loss": 0.4785, + "step": 10261 + }, + { + "epoch": 0.8, + "grad_norm": 1.2578823473000358, + "learning_rate": 2.1021011572425466e-06, + "loss": 0.4409, + "step": 10262 + }, + { + "epoch": 0.8, + "grad_norm": 1.1481951503265189, + "learning_rate": 2.1005601907083585e-06, + "loss": 0.4557, + "step": 10263 + }, + { + "epoch": 0.8, + "grad_norm": 1.1707571674610573, + "learning_rate": 2.0990197229065636e-06, + "loss": 0.4925, + "step": 10264 + }, + { + "epoch": 0.8, + "grad_norm": 1.3843083453869778, + "learning_rate": 2.097479753934415e-06, + "loss": 0.526, + "step": 10265 + }, + { + "epoch": 0.8, + "grad_norm": 1.0259899894493696, + "learning_rate": 2.095940283889143e-06, + "loss": 0.4505, + "step": 10266 + }, + { + "epoch": 0.8, + "grad_norm": 1.3241290846993647, + "learning_rate": 2.0944013128679385e-06, + "loss": 0.5724, + "step": 10267 + }, + { + "epoch": 0.8, + "grad_norm": 1.1714965209127777, + "learning_rate": 2.092862840967966e-06, + "loss": 0.5172, + "step": 10268 + }, + { + "epoch": 0.8, + "grad_norm": 1.181992508378791, + "learning_rate": 2.0913248682863583e-06, + "loss": 0.4796, + "step": 10269 + }, + { + "epoch": 0.8, + "grad_norm": 1.1627365343375886, + "learning_rate": 2.0897873949202175e-06, + "loss": 0.53, + "step": 10270 + }, + { + "epoch": 0.8, + "grad_norm": 1.1728276259038721, + "learning_rate": 2.088250420966608e-06, + "loss": 0.4762, + "step": 10271 + }, + { + "epoch": 0.8, + "grad_norm": 1.196731224089679, + "learning_rate": 2.086713946522567e-06, + "loss": 0.5219, + "step": 10272 + }, + { + "epoch": 0.8, + "grad_norm": 1.1620583402823577, + "learning_rate": 2.085177971685103e-06, + "loss": 0.5089, + "step": 10273 + }, + { + "epoch": 0.8, + "grad_norm": 1.1714205560094304, + "learning_rate": 2.0836424965511913e-06, + "loss": 0.4839, + "step": 10274 + }, + { + "epoch": 0.8, + "grad_norm": 1.1311805282252312, + "learning_rate": 2.082107521217769e-06, + "loss": 0.456, + "step": 10275 + }, + { + "epoch": 0.8, + "grad_norm": 1.253755221674534, + "learning_rate": 2.080573045781753e-06, + "loss": 0.4884, + "step": 10276 + }, + { + "epoch": 0.8, + "grad_norm": 1.0926340677330009, + "learning_rate": 2.0790390703400164e-06, + "loss": 0.4614, + "step": 10277 + }, + { + "epoch": 0.8, + "grad_norm": 1.240784050189275, + "learning_rate": 2.0775055949894096e-06, + "loss": 0.5123, + "step": 10278 + }, + { + "epoch": 0.8, + "grad_norm": 1.2754458171591807, + "learning_rate": 2.0759726198267495e-06, + "loss": 0.535, + "step": 10279 + }, + { + "epoch": 0.8, + "grad_norm": 1.2359064959122852, + "learning_rate": 2.0744401449488238e-06, + "loss": 0.5527, + "step": 10280 + }, + { + "epoch": 0.8, + "grad_norm": 1.095415482189708, + "learning_rate": 2.072908170452379e-06, + "loss": 0.4655, + "step": 10281 + }, + { + "epoch": 0.8, + "grad_norm": 1.235275278261656, + "learning_rate": 2.07137669643414e-06, + "loss": 0.5061, + "step": 10282 + }, + { + "epoch": 0.8, + "grad_norm": 1.1767587748796635, + "learning_rate": 2.0698457229907966e-06, + "loss": 0.4943, + "step": 10283 + }, + { + "epoch": 0.8, + "grad_norm": 1.1228389647051331, + "learning_rate": 2.0683152502190095e-06, + "loss": 0.5053, + "step": 10284 + }, + { + "epoch": 0.8, + "grad_norm": 1.1843336958715984, + "learning_rate": 2.0667852782153996e-06, + "loss": 0.4801, + "step": 10285 + }, + { + "epoch": 0.8, + "grad_norm": 1.2250850064113754, + "learning_rate": 2.0652558070765682e-06, + "loss": 0.5531, + "step": 10286 + }, + { + "epoch": 0.8, + "grad_norm": 1.1554970738445292, + "learning_rate": 2.0637268368990727e-06, + "loss": 0.4522, + "step": 10287 + }, + { + "epoch": 0.8, + "grad_norm": 1.1650304674085714, + "learning_rate": 2.0621983677794486e-06, + "loss": 0.5014, + "step": 10288 + }, + { + "epoch": 0.8, + "grad_norm": 1.2780542293439354, + "learning_rate": 2.0606703998141942e-06, + "loss": 0.501, + "step": 10289 + }, + { + "epoch": 0.8, + "grad_norm": 1.1134729588397931, + "learning_rate": 2.0591429330997793e-06, + "loss": 0.465, + "step": 10290 + }, + { + "epoch": 0.8, + "grad_norm": 1.095193836352529, + "learning_rate": 2.0576159677326437e-06, + "loss": 0.468, + "step": 10291 + }, + { + "epoch": 0.8, + "grad_norm": 1.2125889008169002, + "learning_rate": 2.0560895038091865e-06, + "loss": 0.4905, + "step": 10292 + }, + { + "epoch": 0.8, + "grad_norm": 1.2807207177333237, + "learning_rate": 2.0545635414257835e-06, + "loss": 0.5412, + "step": 10293 + }, + { + "epoch": 0.8, + "grad_norm": 1.1189182986039712, + "learning_rate": 2.053038080678781e-06, + "loss": 0.4888, + "step": 10294 + }, + { + "epoch": 0.8, + "grad_norm": 1.2092515098746894, + "learning_rate": 2.051513121664481e-06, + "loss": 0.5595, + "step": 10295 + }, + { + "epoch": 0.8, + "grad_norm": 1.3039500727983295, + "learning_rate": 2.04998866447917e-06, + "loss": 0.5679, + "step": 10296 + }, + { + "epoch": 0.8, + "grad_norm": 1.185788276139838, + "learning_rate": 2.048464709219089e-06, + "loss": 0.5664, + "step": 10297 + }, + { + "epoch": 0.8, + "grad_norm": 1.1052751674931791, + "learning_rate": 2.046941255980456e-06, + "loss": 0.5147, + "step": 10298 + }, + { + "epoch": 0.8, + "grad_norm": 1.33807064432412, + "learning_rate": 2.0454183048594524e-06, + "loss": 0.5219, + "step": 10299 + }, + { + "epoch": 0.8, + "grad_norm": 1.2498117782026792, + "learning_rate": 2.0438958559522314e-06, + "loss": 0.5476, + "step": 10300 + }, + { + "epoch": 0.8, + "grad_norm": 1.2121789290854608, + "learning_rate": 2.042373909354917e-06, + "loss": 0.4863, + "step": 10301 + }, + { + "epoch": 0.8, + "grad_norm": 1.2257827418399094, + "learning_rate": 2.040852465163591e-06, + "loss": 0.4721, + "step": 10302 + }, + { + "epoch": 0.8, + "grad_norm": 1.2895661728332928, + "learning_rate": 2.0393315234743116e-06, + "loss": 0.4944, + "step": 10303 + }, + { + "epoch": 0.8, + "grad_norm": 1.182525152378456, + "learning_rate": 2.0378110843831077e-06, + "loss": 0.53, + "step": 10304 + }, + { + "epoch": 0.8, + "grad_norm": 1.2645572349624634, + "learning_rate": 2.036291147985967e-06, + "loss": 0.5323, + "step": 10305 + }, + { + "epoch": 0.8, + "grad_norm": 1.1360488962913127, + "learning_rate": 2.034771714378857e-06, + "loss": 0.4909, + "step": 10306 + }, + { + "epoch": 0.8, + "grad_norm": 1.1664564363346435, + "learning_rate": 2.0332527836577e-06, + "loss": 0.5156, + "step": 10307 + }, + { + "epoch": 0.8, + "grad_norm": 1.1656944561021052, + "learning_rate": 2.031734355918399e-06, + "loss": 0.5147, + "step": 10308 + }, + { + "epoch": 0.8, + "grad_norm": 1.2040978498935313, + "learning_rate": 2.0302164312568175e-06, + "loss": 0.4824, + "step": 10309 + }, + { + "epoch": 0.8, + "grad_norm": 1.0852949587972098, + "learning_rate": 2.028699009768792e-06, + "loss": 0.4967, + "step": 10310 + }, + { + "epoch": 0.8, + "grad_norm": 1.133489367084751, + "learning_rate": 2.0271820915501273e-06, + "loss": 0.4768, + "step": 10311 + }, + { + "epoch": 0.8, + "grad_norm": 1.2591906751055966, + "learning_rate": 2.025665676696589e-06, + "loss": 0.5354, + "step": 10312 + }, + { + "epoch": 0.8, + "grad_norm": 1.2247059371818034, + "learning_rate": 2.0241497653039178e-06, + "loss": 0.5186, + "step": 10313 + }, + { + "epoch": 0.8, + "grad_norm": 1.214661710292141, + "learning_rate": 2.0226343574678255e-06, + "loss": 0.514, + "step": 10314 + }, + { + "epoch": 0.8, + "grad_norm": 1.1841975519518038, + "learning_rate": 2.0211194532839807e-06, + "loss": 0.5078, + "step": 10315 + }, + { + "epoch": 0.8, + "grad_norm": 1.1738868736760906, + "learning_rate": 2.019605052848034e-06, + "loss": 0.4821, + "step": 10316 + }, + { + "epoch": 0.8, + "grad_norm": 1.1561511744974924, + "learning_rate": 2.0180911562555904e-06, + "loss": 0.5153, + "step": 10317 + }, + { + "epoch": 0.8, + "grad_norm": 1.144552809997178, + "learning_rate": 2.016577763602233e-06, + "loss": 0.4528, + "step": 10318 + }, + { + "epoch": 0.8, + "grad_norm": 1.1583653765758537, + "learning_rate": 2.015064874983511e-06, + "loss": 0.5034, + "step": 10319 + }, + { + "epoch": 0.8, + "grad_norm": 1.1670531189146744, + "learning_rate": 2.01355249049494e-06, + "loss": 0.4809, + "step": 10320 + }, + { + "epoch": 0.8, + "grad_norm": 1.149081562082276, + "learning_rate": 2.012040610232008e-06, + "loss": 0.4941, + "step": 10321 + }, + { + "epoch": 0.8, + "grad_norm": 1.2771470504638869, + "learning_rate": 2.0105292342901617e-06, + "loss": 0.5262, + "step": 10322 + }, + { + "epoch": 0.8, + "grad_norm": 1.1730759125785757, + "learning_rate": 2.009018362764825e-06, + "loss": 0.5145, + "step": 10323 + }, + { + "epoch": 0.8, + "grad_norm": 1.2472225326324355, + "learning_rate": 2.007507995751391e-06, + "loss": 0.533, + "step": 10324 + }, + { + "epoch": 0.8, + "grad_norm": 1.3087537069914892, + "learning_rate": 2.005998133345208e-06, + "loss": 0.531, + "step": 10325 + }, + { + "epoch": 0.8, + "grad_norm": 1.1604079156587876, + "learning_rate": 2.004488775641611e-06, + "loss": 0.5015, + "step": 10326 + }, + { + "epoch": 0.8, + "grad_norm": 1.405106779161079, + "learning_rate": 2.002979922735886e-06, + "loss": 0.5834, + "step": 10327 + }, + { + "epoch": 0.8, + "grad_norm": 1.2972890353825752, + "learning_rate": 2.001471574723298e-06, + "loss": 0.549, + "step": 10328 + }, + { + "epoch": 0.8, + "grad_norm": 1.1045161479910992, + "learning_rate": 1.999963731699076e-06, + "loss": 0.4414, + "step": 10329 + }, + { + "epoch": 0.8, + "grad_norm": 1.1388594144144728, + "learning_rate": 1.9984563937584177e-06, + "loss": 0.4868, + "step": 10330 + }, + { + "epoch": 0.8, + "grad_norm": 1.1982704414201537, + "learning_rate": 1.996949560996494e-06, + "loss": 0.5345, + "step": 10331 + }, + { + "epoch": 0.8, + "grad_norm": 1.3591611408953175, + "learning_rate": 1.9954432335084307e-06, + "loss": 0.5535, + "step": 10332 + }, + { + "epoch": 0.8, + "grad_norm": 1.1413411021978321, + "learning_rate": 1.9939374113893353e-06, + "loss": 0.4713, + "step": 10333 + }, + { + "epoch": 0.8, + "grad_norm": 1.2577695661526864, + "learning_rate": 1.992432094734279e-06, + "loss": 0.5294, + "step": 10334 + }, + { + "epoch": 0.8, + "grad_norm": 1.152397645886078, + "learning_rate": 1.9909272836382955e-06, + "loss": 0.4752, + "step": 10335 + }, + { + "epoch": 0.8, + "grad_norm": 1.2084929865016871, + "learning_rate": 1.9894229781963957e-06, + "loss": 0.5307, + "step": 10336 + }, + { + "epoch": 0.8, + "grad_norm": 1.184222466704197, + "learning_rate": 1.9879191785035513e-06, + "loss": 0.5068, + "step": 10337 + }, + { + "epoch": 0.8, + "grad_norm": 1.288310299667858, + "learning_rate": 1.9864158846547054e-06, + "loss": 0.5117, + "step": 10338 + }, + { + "epoch": 0.8, + "grad_norm": 1.1521021185522449, + "learning_rate": 1.9849130967447693e-06, + "loss": 0.4821, + "step": 10339 + }, + { + "epoch": 0.8, + "grad_norm": 1.3211294502113327, + "learning_rate": 1.9834108148686225e-06, + "loss": 0.5224, + "step": 10340 + }, + { + "epoch": 0.8, + "grad_norm": 1.126520877233239, + "learning_rate": 1.9819090391211104e-06, + "loss": 0.498, + "step": 10341 + }, + { + "epoch": 0.8, + "grad_norm": 1.2187549884400615, + "learning_rate": 1.9804077695970513e-06, + "loss": 0.5028, + "step": 10342 + }, + { + "epoch": 0.8, + "grad_norm": 1.3013953056887446, + "learning_rate": 1.978907006391223e-06, + "loss": 0.5243, + "step": 10343 + }, + { + "epoch": 0.8, + "grad_norm": 1.222376002164524, + "learning_rate": 1.977406749598382e-06, + "loss": 0.5209, + "step": 10344 + }, + { + "epoch": 0.8, + "grad_norm": 1.2206317361937802, + "learning_rate": 1.9759069993132405e-06, + "loss": 0.4998, + "step": 10345 + }, + { + "epoch": 0.8, + "grad_norm": 1.1608321664498062, + "learning_rate": 1.9744077556304885e-06, + "loss": 0.4799, + "step": 10346 + }, + { + "epoch": 0.8, + "grad_norm": 1.1636473152101807, + "learning_rate": 1.9729090186447853e-06, + "loss": 0.4657, + "step": 10347 + }, + { + "epoch": 0.8, + "grad_norm": 1.2029562992869192, + "learning_rate": 1.9714107884507474e-06, + "loss": 0.5271, + "step": 10348 + }, + { + "epoch": 0.8, + "grad_norm": 1.2204066534511777, + "learning_rate": 1.9699130651429676e-06, + "loss": 0.4997, + "step": 10349 + }, + { + "epoch": 0.8, + "grad_norm": 1.1497398828640724, + "learning_rate": 1.9684158488160065e-06, + "loss": 0.5064, + "step": 10350 + }, + { + "epoch": 0.8, + "grad_norm": 1.1631250382904588, + "learning_rate": 1.9669191395643906e-06, + "loss": 0.4446, + "step": 10351 + }, + { + "epoch": 0.8, + "grad_norm": 1.1349997615393839, + "learning_rate": 1.965422937482616e-06, + "loss": 0.4859, + "step": 10352 + }, + { + "epoch": 0.8, + "grad_norm": 1.0774769701344173, + "learning_rate": 1.9639272426651435e-06, + "loss": 0.4684, + "step": 10353 + }, + { + "epoch": 0.8, + "grad_norm": 1.1835434402499982, + "learning_rate": 1.962432055206406e-06, + "loss": 0.4571, + "step": 10354 + }, + { + "epoch": 0.8, + "grad_norm": 1.1445839514983547, + "learning_rate": 1.9609373752008e-06, + "loss": 0.5004, + "step": 10355 + }, + { + "epoch": 0.8, + "grad_norm": 1.2275289142101145, + "learning_rate": 1.9594432027426925e-06, + "loss": 0.5227, + "step": 10356 + }, + { + "epoch": 0.8, + "grad_norm": 1.2593028556662307, + "learning_rate": 1.9579495379264223e-06, + "loss": 0.4979, + "step": 10357 + }, + { + "epoch": 0.8, + "grad_norm": 1.2159801699784512, + "learning_rate": 1.9564563808462867e-06, + "loss": 0.5156, + "step": 10358 + }, + { + "epoch": 0.8, + "grad_norm": 1.2114540229036244, + "learning_rate": 1.9549637315965587e-06, + "loss": 0.5387, + "step": 10359 + }, + { + "epoch": 0.8, + "grad_norm": 1.2060288399463288, + "learning_rate": 1.9534715902714775e-06, + "loss": 0.5223, + "step": 10360 + }, + { + "epoch": 0.8, + "grad_norm": 1.2462118923899952, + "learning_rate": 1.9519799569652498e-06, + "loss": 0.5481, + "step": 10361 + }, + { + "epoch": 0.8, + "grad_norm": 1.1920891977481711, + "learning_rate": 1.9504888317720515e-06, + "loss": 0.4916, + "step": 10362 + }, + { + "epoch": 0.8, + "grad_norm": 1.337029930649945, + "learning_rate": 1.948998214786022e-06, + "loss": 0.5658, + "step": 10363 + }, + { + "epoch": 0.8, + "grad_norm": 1.1651741198959462, + "learning_rate": 1.9475081061012746e-06, + "loss": 0.4734, + "step": 10364 + }, + { + "epoch": 0.8, + "grad_norm": 1.0222954821134806, + "learning_rate": 1.946018505811883e-06, + "loss": 0.4284, + "step": 10365 + }, + { + "epoch": 0.8, + "grad_norm": 1.1425155231247648, + "learning_rate": 1.9445294140118965e-06, + "loss": 0.4887, + "step": 10366 + }, + { + "epoch": 0.8, + "grad_norm": 1.1424013180341783, + "learning_rate": 1.9430408307953317e-06, + "loss": 0.5349, + "step": 10367 + }, + { + "epoch": 0.8, + "grad_norm": 1.3395171365157932, + "learning_rate": 1.9415527562561655e-06, + "loss": 0.5672, + "step": 10368 + }, + { + "epoch": 0.8, + "grad_norm": 1.1631960620520938, + "learning_rate": 1.9400651904883492e-06, + "loss": 0.5056, + "step": 10369 + }, + { + "epoch": 0.8, + "grad_norm": 1.1770906476268992, + "learning_rate": 1.9385781335858014e-06, + "loss": 0.4884, + "step": 10370 + }, + { + "epoch": 0.8, + "grad_norm": 1.2783768235575272, + "learning_rate": 1.937091585642408e-06, + "loss": 0.5395, + "step": 10371 + }, + { + "epoch": 0.8, + "grad_norm": 1.1769348264652402, + "learning_rate": 1.935605546752023e-06, + "loss": 0.5312, + "step": 10372 + }, + { + "epoch": 0.8, + "grad_norm": 1.2309911194333034, + "learning_rate": 1.934120017008465e-06, + "loss": 0.5031, + "step": 10373 + }, + { + "epoch": 0.8, + "grad_norm": 1.2335654392581403, + "learning_rate": 1.932634996505528e-06, + "loss": 0.4825, + "step": 10374 + }, + { + "epoch": 0.8, + "grad_norm": 1.1826541305807252, + "learning_rate": 1.931150485336962e-06, + "loss": 0.5027, + "step": 10375 + }, + { + "epoch": 0.8, + "grad_norm": 1.1497857619876597, + "learning_rate": 1.9296664835964975e-06, + "loss": 0.4593, + "step": 10376 + }, + { + "epoch": 0.81, + "grad_norm": 1.2023675689420044, + "learning_rate": 1.928182991377826e-06, + "loss": 0.5154, + "step": 10377 + }, + { + "epoch": 0.81, + "grad_norm": 1.1732740570307119, + "learning_rate": 1.926700008774606e-06, + "loss": 0.467, + "step": 10378 + }, + { + "epoch": 0.81, + "grad_norm": 1.2388419918389142, + "learning_rate": 1.9252175358804657e-06, + "loss": 0.4995, + "step": 10379 + }, + { + "epoch": 0.81, + "grad_norm": 1.2624740465725812, + "learning_rate": 1.9237355727890037e-06, + "loss": 0.525, + "step": 10380 + }, + { + "epoch": 0.81, + "grad_norm": 1.2348135639890256, + "learning_rate": 1.922254119593784e-06, + "loss": 0.4428, + "step": 10381 + }, + { + "epoch": 0.81, + "grad_norm": 1.1873731545412778, + "learning_rate": 1.9207731763883388e-06, + "loss": 0.515, + "step": 10382 + }, + { + "epoch": 0.81, + "grad_norm": 1.393612855741153, + "learning_rate": 1.9192927432661645e-06, + "loss": 0.5578, + "step": 10383 + }, + { + "epoch": 0.81, + "grad_norm": 1.1886440588497955, + "learning_rate": 1.9178128203207324e-06, + "loss": 0.5003, + "step": 10384 + }, + { + "epoch": 0.81, + "grad_norm": 1.2326494541819835, + "learning_rate": 1.916333407645472e-06, + "loss": 0.5637, + "step": 10385 + }, + { + "epoch": 0.81, + "grad_norm": 1.233964441325004, + "learning_rate": 1.914854505333791e-06, + "loss": 0.4889, + "step": 10386 + }, + { + "epoch": 0.81, + "grad_norm": 1.2268538402378648, + "learning_rate": 1.9133761134790618e-06, + "loss": 0.5438, + "step": 10387 + }, + { + "epoch": 0.81, + "grad_norm": 1.0855906090260632, + "learning_rate": 1.9118982321746173e-06, + "loss": 0.4048, + "step": 10388 + }, + { + "epoch": 0.81, + "grad_norm": 1.2265365014692144, + "learning_rate": 1.9104208615137654e-06, + "loss": 0.4727, + "step": 10389 + }, + { + "epoch": 0.81, + "grad_norm": 1.0774274034228972, + "learning_rate": 1.908944001589782e-06, + "loss": 0.469, + "step": 10390 + }, + { + "epoch": 0.81, + "grad_norm": 1.158155572182002, + "learning_rate": 1.907467652495909e-06, + "loss": 0.4746, + "step": 10391 + }, + { + "epoch": 0.81, + "grad_norm": 1.191380384821055, + "learning_rate": 1.9059918143253564e-06, + "loss": 0.4869, + "step": 10392 + }, + { + "epoch": 0.81, + "grad_norm": 1.2269686372754622, + "learning_rate": 1.9045164871713007e-06, + "loss": 0.4924, + "step": 10393 + }, + { + "epoch": 0.81, + "grad_norm": 1.1213116583568241, + "learning_rate": 1.903041671126884e-06, + "loss": 0.4862, + "step": 10394 + }, + { + "epoch": 0.81, + "grad_norm": 1.2229470099216209, + "learning_rate": 1.9015673662852207e-06, + "loss": 0.5175, + "step": 10395 + }, + { + "epoch": 0.81, + "grad_norm": 1.3604078205905663, + "learning_rate": 1.900093572739392e-06, + "loss": 0.5074, + "step": 10396 + }, + { + "epoch": 0.81, + "grad_norm": 1.1943786064504713, + "learning_rate": 1.898620290582447e-06, + "loss": 0.474, + "step": 10397 + }, + { + "epoch": 0.81, + "grad_norm": 1.2919564332289357, + "learning_rate": 1.897147519907403e-06, + "loss": 0.5327, + "step": 10398 + }, + { + "epoch": 0.81, + "grad_norm": 1.187736387315714, + "learning_rate": 1.8956752608072382e-06, + "loss": 0.5016, + "step": 10399 + }, + { + "epoch": 0.81, + "grad_norm": 1.1687796339062262, + "learning_rate": 1.894203513374907e-06, + "loss": 0.5086, + "step": 10400 + }, + { + "epoch": 0.81, + "grad_norm": 1.3468612316162474, + "learning_rate": 1.8927322777033285e-06, + "loss": 0.5067, + "step": 10401 + }, + { + "epoch": 0.81, + "grad_norm": 1.2436114612654876, + "learning_rate": 1.8912615538853919e-06, + "loss": 0.5292, + "step": 10402 + }, + { + "epoch": 0.81, + "grad_norm": 1.2027217139543736, + "learning_rate": 1.8897913420139492e-06, + "loss": 0.4489, + "step": 10403 + }, + { + "epoch": 0.81, + "grad_norm": 1.1082299999357392, + "learning_rate": 1.8883216421818196e-06, + "loss": 0.4796, + "step": 10404 + }, + { + "epoch": 0.81, + "grad_norm": 1.1507077257710712, + "learning_rate": 1.8868524544817957e-06, + "loss": 0.5064, + "step": 10405 + }, + { + "epoch": 0.81, + "grad_norm": 1.1087962507989588, + "learning_rate": 1.8853837790066343e-06, + "loss": 0.511, + "step": 10406 + }, + { + "epoch": 0.81, + "grad_norm": 1.2132909240140473, + "learning_rate": 1.8839156158490612e-06, + "loss": 0.5222, + "step": 10407 + }, + { + "epoch": 0.81, + "grad_norm": 1.2021377277904344, + "learning_rate": 1.8824479651017712e-06, + "loss": 0.4608, + "step": 10408 + }, + { + "epoch": 0.81, + "grad_norm": 1.183529137588327, + "learning_rate": 1.8809808268574192e-06, + "loss": 0.5101, + "step": 10409 + }, + { + "epoch": 0.81, + "grad_norm": 1.1541069894412248, + "learning_rate": 1.8795142012086364e-06, + "loss": 0.4803, + "step": 10410 + }, + { + "epoch": 0.81, + "grad_norm": 1.2045538208410849, + "learning_rate": 1.8780480882480189e-06, + "loss": 0.5278, + "step": 10411 + }, + { + "epoch": 0.81, + "grad_norm": 1.3228246952236407, + "learning_rate": 1.8765824880681317e-06, + "loss": 0.5211, + "step": 10412 + }, + { + "epoch": 0.81, + "grad_norm": 1.2195136783172147, + "learning_rate": 1.8751174007615026e-06, + "loss": 0.4967, + "step": 10413 + }, + { + "epoch": 0.81, + "grad_norm": 1.2479047858554162, + "learning_rate": 1.8736528264206289e-06, + "loss": 0.5097, + "step": 10414 + }, + { + "epoch": 0.81, + "grad_norm": 1.158638832002312, + "learning_rate": 1.872188765137979e-06, + "loss": 0.5247, + "step": 10415 + }, + { + "epoch": 0.81, + "grad_norm": 1.10435893825904, + "learning_rate": 1.8707252170059864e-06, + "loss": 0.4589, + "step": 10416 + }, + { + "epoch": 0.81, + "grad_norm": 1.214204184415717, + "learning_rate": 1.869262182117052e-06, + "loss": 0.4974, + "step": 10417 + }, + { + "epoch": 0.81, + "grad_norm": 1.146734571044995, + "learning_rate": 1.8677996605635473e-06, + "loss": 0.4902, + "step": 10418 + }, + { + "epoch": 0.81, + "grad_norm": 1.1523582845514786, + "learning_rate": 1.866337652437805e-06, + "loss": 0.5315, + "step": 10419 + }, + { + "epoch": 0.81, + "grad_norm": 1.1235626893729556, + "learning_rate": 1.8648761578321296e-06, + "loss": 0.4829, + "step": 10420 + }, + { + "epoch": 0.81, + "grad_norm": 1.1653543766647423, + "learning_rate": 1.8634151768387954e-06, + "loss": 0.4622, + "step": 10421 + }, + { + "epoch": 0.81, + "grad_norm": 1.0817316515014181, + "learning_rate": 1.8619547095500423e-06, + "loss": 0.4459, + "step": 10422 + }, + { + "epoch": 0.81, + "grad_norm": 1.241798531914272, + "learning_rate": 1.8604947560580756e-06, + "loss": 0.4962, + "step": 10423 + }, + { + "epoch": 0.81, + "grad_norm": 1.1386425609970265, + "learning_rate": 1.8590353164550656e-06, + "loss": 0.5111, + "step": 10424 + }, + { + "epoch": 0.81, + "grad_norm": 1.244623207387527, + "learning_rate": 1.857576390833159e-06, + "loss": 0.5167, + "step": 10425 + }, + { + "epoch": 0.81, + "grad_norm": 1.172029001925191, + "learning_rate": 1.8561179792844642e-06, + "loss": 0.5243, + "step": 10426 + }, + { + "epoch": 0.81, + "grad_norm": 1.2720301053185297, + "learning_rate": 1.8546600819010575e-06, + "loss": 0.5693, + "step": 10427 + }, + { + "epoch": 0.81, + "grad_norm": 1.156249742250156, + "learning_rate": 1.8532026987749874e-06, + "loss": 0.4858, + "step": 10428 + }, + { + "epoch": 0.81, + "grad_norm": 1.2187584607735416, + "learning_rate": 1.8517458299982604e-06, + "loss": 0.519, + "step": 10429 + }, + { + "epoch": 0.81, + "grad_norm": 1.2533085429414896, + "learning_rate": 1.8502894756628587e-06, + "loss": 0.5415, + "step": 10430 + }, + { + "epoch": 0.81, + "grad_norm": 1.1305972307716194, + "learning_rate": 1.8488336358607296e-06, + "loss": 0.453, + "step": 10431 + }, + { + "epoch": 0.81, + "grad_norm": 1.2000558522260178, + "learning_rate": 1.8473783106837896e-06, + "loss": 0.4781, + "step": 10432 + }, + { + "epoch": 0.81, + "grad_norm": 1.1443368271399246, + "learning_rate": 1.8459235002239183e-06, + "loss": 0.4921, + "step": 10433 + }, + { + "epoch": 0.81, + "grad_norm": 1.2608846263364868, + "learning_rate": 1.844469204572964e-06, + "loss": 0.557, + "step": 10434 + }, + { + "epoch": 0.81, + "grad_norm": 1.1178104824424413, + "learning_rate": 1.843015423822746e-06, + "loss": 0.4885, + "step": 10435 + }, + { + "epoch": 0.81, + "grad_norm": 1.1748830209493364, + "learning_rate": 1.841562158065049e-06, + "loss": 0.5095, + "step": 10436 + }, + { + "epoch": 0.81, + "grad_norm": 1.165367367987973, + "learning_rate": 1.8401094073916237e-06, + "loss": 0.4664, + "step": 10437 + }, + { + "epoch": 0.81, + "grad_norm": 1.295025483326214, + "learning_rate": 1.8386571718941947e-06, + "loss": 0.5833, + "step": 10438 + }, + { + "epoch": 0.81, + "grad_norm": 1.1498459464009567, + "learning_rate": 1.8372054516644422e-06, + "loss": 0.4455, + "step": 10439 + }, + { + "epoch": 0.81, + "grad_norm": 1.1600520539936359, + "learning_rate": 1.8357542467940249e-06, + "loss": 0.4456, + "step": 10440 + }, + { + "epoch": 0.81, + "grad_norm": 1.1858258241284776, + "learning_rate": 1.8343035573745637e-06, + "loss": 0.4609, + "step": 10441 + }, + { + "epoch": 0.81, + "grad_norm": 1.2969841968570677, + "learning_rate": 1.8328533834976503e-06, + "loss": 0.538, + "step": 10442 + }, + { + "epoch": 0.81, + "grad_norm": 1.247649653446576, + "learning_rate": 1.83140372525484e-06, + "loss": 0.5145, + "step": 10443 + }, + { + "epoch": 0.81, + "grad_norm": 1.1970741903024715, + "learning_rate": 1.8299545827376552e-06, + "loss": 0.5031, + "step": 10444 + }, + { + "epoch": 0.81, + "grad_norm": 1.1042433778038725, + "learning_rate": 1.8285059560375883e-06, + "loss": 0.5108, + "step": 10445 + }, + { + "epoch": 0.81, + "grad_norm": 1.154651619607953, + "learning_rate": 1.8270578452461007e-06, + "loss": 0.4967, + "step": 10446 + }, + { + "epoch": 0.81, + "grad_norm": 1.1493458691172969, + "learning_rate": 1.825610250454618e-06, + "loss": 0.5224, + "step": 10447 + }, + { + "epoch": 0.81, + "grad_norm": 1.2318701620021442, + "learning_rate": 1.824163171754536e-06, + "loss": 0.4985, + "step": 10448 + }, + { + "epoch": 0.81, + "grad_norm": 1.0756030232180962, + "learning_rate": 1.8227166092372138e-06, + "loss": 0.4431, + "step": 10449 + }, + { + "epoch": 0.81, + "grad_norm": 1.1653508986481198, + "learning_rate": 1.8212705629939798e-06, + "loss": 0.5148, + "step": 10450 + }, + { + "epoch": 0.81, + "grad_norm": 1.4137273992052395, + "learning_rate": 1.8198250331161327e-06, + "loss": 0.5678, + "step": 10451 + }, + { + "epoch": 0.81, + "grad_norm": 1.2140237180978994, + "learning_rate": 1.8183800196949375e-06, + "loss": 0.5141, + "step": 10452 + }, + { + "epoch": 0.81, + "grad_norm": 1.1900106792611889, + "learning_rate": 1.8169355228216211e-06, + "loss": 0.524, + "step": 10453 + }, + { + "epoch": 0.81, + "grad_norm": 1.2498308544158938, + "learning_rate": 1.8154915425873865e-06, + "loss": 0.4813, + "step": 10454 + }, + { + "epoch": 0.81, + "grad_norm": 1.2530844303708175, + "learning_rate": 1.8140480790833958e-06, + "loss": 0.5208, + "step": 10455 + }, + { + "epoch": 0.81, + "grad_norm": 1.1372842856635434, + "learning_rate": 1.8126051324007821e-06, + "loss": 0.4889, + "step": 10456 + }, + { + "epoch": 0.81, + "grad_norm": 1.1247584825455763, + "learning_rate": 1.8111627026306488e-06, + "loss": 0.4374, + "step": 10457 + }, + { + "epoch": 0.81, + "grad_norm": 1.2168712807526494, + "learning_rate": 1.8097207898640633e-06, + "loss": 0.4463, + "step": 10458 + }, + { + "epoch": 0.81, + "grad_norm": 1.215735693748371, + "learning_rate": 1.808279394192063e-06, + "loss": 0.5298, + "step": 10459 + }, + { + "epoch": 0.81, + "grad_norm": 1.3411277646325948, + "learning_rate": 1.8068385157056446e-06, + "loss": 0.5594, + "step": 10460 + }, + { + "epoch": 0.81, + "grad_norm": 1.203137880727011, + "learning_rate": 1.8053981544957832e-06, + "loss": 0.5018, + "step": 10461 + }, + { + "epoch": 0.81, + "grad_norm": 1.3381786175693573, + "learning_rate": 1.8039583106534164e-06, + "loss": 0.5225, + "step": 10462 + }, + { + "epoch": 0.81, + "grad_norm": 1.1489099062930568, + "learning_rate": 1.8025189842694458e-06, + "loss": 0.5054, + "step": 10463 + }, + { + "epoch": 0.81, + "grad_norm": 1.280028575116321, + "learning_rate": 1.8010801754347473e-06, + "loss": 0.5046, + "step": 10464 + }, + { + "epoch": 0.81, + "grad_norm": 1.1998320760058416, + "learning_rate": 1.7996418842401552e-06, + "loss": 0.5023, + "step": 10465 + }, + { + "epoch": 0.81, + "grad_norm": 1.277502440881357, + "learning_rate": 1.7982041107764803e-06, + "loss": 0.5216, + "step": 10466 + }, + { + "epoch": 0.81, + "grad_norm": 1.2627140052633197, + "learning_rate": 1.796766855134494e-06, + "loss": 0.5333, + "step": 10467 + }, + { + "epoch": 0.81, + "grad_norm": 1.3363706990641024, + "learning_rate": 1.795330117404941e-06, + "loss": 0.4937, + "step": 10468 + }, + { + "epoch": 0.81, + "grad_norm": 1.1386407811939736, + "learning_rate": 1.7938938976785302e-06, + "loss": 0.5339, + "step": 10469 + }, + { + "epoch": 0.81, + "grad_norm": 1.1885433131258651, + "learning_rate": 1.792458196045932e-06, + "loss": 0.4912, + "step": 10470 + }, + { + "epoch": 0.81, + "grad_norm": 1.1723792453554376, + "learning_rate": 1.7910230125977945e-06, + "loss": 0.483, + "step": 10471 + }, + { + "epoch": 0.81, + "grad_norm": 1.3337188948031438, + "learning_rate": 1.789588347424729e-06, + "loss": 0.5345, + "step": 10472 + }, + { + "epoch": 0.81, + "grad_norm": 1.1924109553234687, + "learning_rate": 1.7881542006173091e-06, + "loss": 0.5484, + "step": 10473 + }, + { + "epoch": 0.81, + "grad_norm": 1.2019992862896063, + "learning_rate": 1.786720572266084e-06, + "loss": 0.4812, + "step": 10474 + }, + { + "epoch": 0.81, + "grad_norm": 1.1503896923741443, + "learning_rate": 1.7852874624615624e-06, + "loss": 0.5159, + "step": 10475 + }, + { + "epoch": 0.81, + "grad_norm": 1.1572777072855462, + "learning_rate": 1.783854871294225e-06, + "loss": 0.4836, + "step": 10476 + }, + { + "epoch": 0.81, + "grad_norm": 1.1532732302287767, + "learning_rate": 1.7824227988545194e-06, + "loss": 0.509, + "step": 10477 + }, + { + "epoch": 0.81, + "grad_norm": 1.3525361980311499, + "learning_rate": 1.7809912452328592e-06, + "loss": 0.5405, + "step": 10478 + }, + { + "epoch": 0.81, + "grad_norm": 1.25077342901221, + "learning_rate": 1.7795602105196297e-06, + "loss": 0.543, + "step": 10479 + }, + { + "epoch": 0.81, + "grad_norm": 1.1658554094872522, + "learning_rate": 1.778129694805173e-06, + "loss": 0.4991, + "step": 10480 + }, + { + "epoch": 0.81, + "grad_norm": 1.216405128865592, + "learning_rate": 1.776699698179808e-06, + "loss": 0.4853, + "step": 10481 + }, + { + "epoch": 0.81, + "grad_norm": 1.162459735788342, + "learning_rate": 1.7752702207338202e-06, + "loss": 0.5001, + "step": 10482 + }, + { + "epoch": 0.81, + "grad_norm": 1.1763861231425468, + "learning_rate": 1.7738412625574542e-06, + "loss": 0.4947, + "step": 10483 + }, + { + "epoch": 0.81, + "grad_norm": 1.3161269437790237, + "learning_rate": 1.7724128237409344e-06, + "loss": 0.5357, + "step": 10484 + }, + { + "epoch": 0.81, + "grad_norm": 1.297538541614623, + "learning_rate": 1.7709849043744387e-06, + "loss": 0.5203, + "step": 10485 + }, + { + "epoch": 0.81, + "grad_norm": 1.199755321830302, + "learning_rate": 1.7695575045481218e-06, + "loss": 0.4795, + "step": 10486 + }, + { + "epoch": 0.81, + "grad_norm": 1.1301971763142185, + "learning_rate": 1.7681306243521035e-06, + "loss": 0.4911, + "step": 10487 + }, + { + "epoch": 0.81, + "grad_norm": 1.0538742320723389, + "learning_rate": 1.7667042638764697e-06, + "loss": 0.4589, + "step": 10488 + }, + { + "epoch": 0.81, + "grad_norm": 1.0407850858339627, + "learning_rate": 1.7652784232112763e-06, + "loss": 0.4434, + "step": 10489 + }, + { + "epoch": 0.81, + "grad_norm": 1.1943022005021457, + "learning_rate": 1.76385310244654e-06, + "loss": 0.5133, + "step": 10490 + }, + { + "epoch": 0.81, + "grad_norm": 1.3291034908687103, + "learning_rate": 1.762428301672251e-06, + "loss": 0.5734, + "step": 10491 + }, + { + "epoch": 0.81, + "grad_norm": 1.3266406122761054, + "learning_rate": 1.761004020978363e-06, + "loss": 0.6024, + "step": 10492 + }, + { + "epoch": 0.81, + "grad_norm": 1.1645543640209564, + "learning_rate": 1.7595802604547974e-06, + "loss": 0.496, + "step": 10493 + }, + { + "epoch": 0.81, + "grad_norm": 1.2184970299735831, + "learning_rate": 1.7581570201914478e-06, + "loss": 0.5359, + "step": 10494 + }, + { + "epoch": 0.81, + "grad_norm": 1.1513206983762794, + "learning_rate": 1.7567343002781656e-06, + "loss": 0.5129, + "step": 10495 + }, + { + "epoch": 0.81, + "grad_norm": 1.330582264097016, + "learning_rate": 1.7553121008047768e-06, + "loss": 0.5321, + "step": 10496 + }, + { + "epoch": 0.81, + "grad_norm": 1.209381827007413, + "learning_rate": 1.753890421861072e-06, + "loss": 0.5503, + "step": 10497 + }, + { + "epoch": 0.81, + "grad_norm": 1.2934417781312084, + "learning_rate": 1.752469263536809e-06, + "loss": 0.5061, + "step": 10498 + }, + { + "epoch": 0.81, + "grad_norm": 1.2345777900323305, + "learning_rate": 1.7510486259217151e-06, + "loss": 0.5093, + "step": 10499 + }, + { + "epoch": 0.81, + "grad_norm": 1.2739734574941015, + "learning_rate": 1.7496285091054788e-06, + "loss": 0.5533, + "step": 10500 + }, + { + "epoch": 0.81, + "grad_norm": 1.1736133144039462, + "learning_rate": 1.7482089131777635e-06, + "loss": 0.4895, + "step": 10501 + }, + { + "epoch": 0.81, + "grad_norm": 1.244689484981838, + "learning_rate": 1.74678983822819e-06, + "loss": 0.4912, + "step": 10502 + }, + { + "epoch": 0.81, + "grad_norm": 1.3912718479329513, + "learning_rate": 1.7453712843463554e-06, + "loss": 0.5747, + "step": 10503 + }, + { + "epoch": 0.81, + "grad_norm": 1.2169156085479538, + "learning_rate": 1.7439532516218226e-06, + "loss": 0.5029, + "step": 10504 + }, + { + "epoch": 0.81, + "grad_norm": 1.1270013068359976, + "learning_rate": 1.7425357401441134e-06, + "loss": 0.4715, + "step": 10505 + }, + { + "epoch": 0.82, + "grad_norm": 1.253966570677824, + "learning_rate": 1.7411187500027272e-06, + "loss": 0.5125, + "step": 10506 + }, + { + "epoch": 0.82, + "grad_norm": 1.2157143175119676, + "learning_rate": 1.7397022812871234e-06, + "loss": 0.5049, + "step": 10507 + }, + { + "epoch": 0.82, + "grad_norm": 1.2118043811480281, + "learning_rate": 1.7382863340867316e-06, + "loss": 0.5172, + "step": 10508 + }, + { + "epoch": 0.82, + "grad_norm": 1.3806981878883626, + "learning_rate": 1.7368709084909496e-06, + "loss": 0.5133, + "step": 10509 + }, + { + "epoch": 0.82, + "grad_norm": 1.2219046333017223, + "learning_rate": 1.735456004589141e-06, + "loss": 0.4581, + "step": 10510 + }, + { + "epoch": 0.82, + "grad_norm": 1.217999594091714, + "learning_rate": 1.7340416224706346e-06, + "loss": 0.4843, + "step": 10511 + }, + { + "epoch": 0.82, + "grad_norm": 1.162585044129144, + "learning_rate": 1.7326277622247245e-06, + "loss": 0.5077, + "step": 10512 + }, + { + "epoch": 0.82, + "grad_norm": 1.092069042587265, + "learning_rate": 1.7312144239406781e-06, + "loss": 0.4631, + "step": 10513 + }, + { + "epoch": 0.82, + "grad_norm": 1.3889924228013881, + "learning_rate": 1.7298016077077273e-06, + "loss": 0.5579, + "step": 10514 + }, + { + "epoch": 0.82, + "grad_norm": 1.1529110821220014, + "learning_rate": 1.728389313615071e-06, + "loss": 0.4757, + "step": 10515 + }, + { + "epoch": 0.82, + "grad_norm": 1.31379771791104, + "learning_rate": 1.7269775417518708e-06, + "loss": 0.5394, + "step": 10516 + }, + { + "epoch": 0.82, + "grad_norm": 1.2688102670270993, + "learning_rate": 1.7255662922072613e-06, + "loss": 0.505, + "step": 10517 + }, + { + "epoch": 0.82, + "grad_norm": 1.2353242531875206, + "learning_rate": 1.7241555650703433e-06, + "loss": 0.5218, + "step": 10518 + }, + { + "epoch": 0.82, + "grad_norm": 1.267278321347615, + "learning_rate": 1.7227453604301814e-06, + "loss": 0.4969, + "step": 10519 + }, + { + "epoch": 0.82, + "grad_norm": 1.1978121781043962, + "learning_rate": 1.7213356783758128e-06, + "loss": 0.4724, + "step": 10520 + }, + { + "epoch": 0.82, + "grad_norm": 1.210791006763986, + "learning_rate": 1.7199265189962345e-06, + "loss": 0.5214, + "step": 10521 + }, + { + "epoch": 0.82, + "grad_norm": 1.239744314858782, + "learning_rate": 1.7185178823804127e-06, + "loss": 0.553, + "step": 10522 + }, + { + "epoch": 0.82, + "grad_norm": 1.2265063716413458, + "learning_rate": 1.7171097686172832e-06, + "loss": 0.5414, + "step": 10523 + }, + { + "epoch": 0.82, + "grad_norm": 1.1923382225103458, + "learning_rate": 1.7157021777957494e-06, + "loss": 0.4642, + "step": 10524 + }, + { + "epoch": 0.82, + "grad_norm": 1.131043308828589, + "learning_rate": 1.7142951100046802e-06, + "loss": 0.4543, + "step": 10525 + }, + { + "epoch": 0.82, + "grad_norm": 1.1441874329436754, + "learning_rate": 1.7128885653329065e-06, + "loss": 0.4569, + "step": 10526 + }, + { + "epoch": 0.82, + "grad_norm": 1.1432271043531488, + "learning_rate": 1.711482543869234e-06, + "loss": 0.5425, + "step": 10527 + }, + { + "epoch": 0.82, + "grad_norm": 1.0816328506493416, + "learning_rate": 1.7100770457024307e-06, + "loss": 0.5059, + "step": 10528 + }, + { + "epoch": 0.82, + "grad_norm": 1.4127508582100923, + "learning_rate": 1.7086720709212357e-06, + "loss": 0.5717, + "step": 10529 + }, + { + "epoch": 0.82, + "grad_norm": 1.156142100892866, + "learning_rate": 1.7072676196143512e-06, + "loss": 0.5183, + "step": 10530 + }, + { + "epoch": 0.82, + "grad_norm": 1.1123369997604466, + "learning_rate": 1.7058636918704474e-06, + "loss": 0.4447, + "step": 10531 + }, + { + "epoch": 0.82, + "grad_norm": 1.188770768699168, + "learning_rate": 1.704460287778159e-06, + "loss": 0.4777, + "step": 10532 + }, + { + "epoch": 0.82, + "grad_norm": 1.3072897263876002, + "learning_rate": 1.7030574074260908e-06, + "loss": 0.5636, + "step": 10533 + }, + { + "epoch": 0.82, + "grad_norm": 1.143892650359272, + "learning_rate": 1.7016550509028162e-06, + "loss": 0.4896, + "step": 10534 + }, + { + "epoch": 0.82, + "grad_norm": 1.130228977044135, + "learning_rate": 1.7002532182968734e-06, + "loss": 0.4782, + "step": 10535 + }, + { + "epoch": 0.82, + "grad_norm": 1.1603531590084593, + "learning_rate": 1.6988519096967647e-06, + "loss": 0.5064, + "step": 10536 + }, + { + "epoch": 0.82, + "grad_norm": 1.1335112948567967, + "learning_rate": 1.697451125190962e-06, + "loss": 0.508, + "step": 10537 + }, + { + "epoch": 0.82, + "grad_norm": 1.2376035646971673, + "learning_rate": 1.6960508648679064e-06, + "loss": 0.5099, + "step": 10538 + }, + { + "epoch": 0.82, + "grad_norm": 1.2352067101031956, + "learning_rate": 1.6946511288160017e-06, + "loss": 0.5089, + "step": 10539 + }, + { + "epoch": 0.82, + "grad_norm": 1.1461286626593559, + "learning_rate": 1.693251917123624e-06, + "loss": 0.4921, + "step": 10540 + }, + { + "epoch": 0.82, + "grad_norm": 1.3278851460806542, + "learning_rate": 1.6918532298791102e-06, + "loss": 0.5413, + "step": 10541 + }, + { + "epoch": 0.82, + "grad_norm": 1.2688491162450473, + "learning_rate": 1.6904550671707632e-06, + "loss": 0.4886, + "step": 10542 + }, + { + "epoch": 0.82, + "grad_norm": 1.3579935969312724, + "learning_rate": 1.68905742908686e-06, + "loss": 0.5718, + "step": 10543 + }, + { + "epoch": 0.82, + "grad_norm": 1.2583445495880312, + "learning_rate": 1.6876603157156402e-06, + "loss": 0.5128, + "step": 10544 + }, + { + "epoch": 0.82, + "grad_norm": 1.1596382603102533, + "learning_rate": 1.686263727145313e-06, + "loss": 0.4761, + "step": 10545 + }, + { + "epoch": 0.82, + "grad_norm": 1.4515619382662144, + "learning_rate": 1.684867663464047e-06, + "loss": 0.4988, + "step": 10546 + }, + { + "epoch": 0.82, + "grad_norm": 1.1546457863836639, + "learning_rate": 1.6834721247599871e-06, + "loss": 0.5124, + "step": 10547 + }, + { + "epoch": 0.82, + "grad_norm": 1.371756542887517, + "learning_rate": 1.6820771111212386e-06, + "loss": 0.5701, + "step": 10548 + }, + { + "epoch": 0.82, + "grad_norm": 1.2650184590798446, + "learning_rate": 1.6806826226358775e-06, + "loss": 0.4859, + "step": 10549 + }, + { + "epoch": 0.82, + "grad_norm": 1.297595134400607, + "learning_rate": 1.6792886593919454e-06, + "loss": 0.5629, + "step": 10550 + }, + { + "epoch": 0.82, + "grad_norm": 1.1903674490401843, + "learning_rate": 1.6778952214774513e-06, + "loss": 0.4868, + "step": 10551 + }, + { + "epoch": 0.82, + "grad_norm": 1.2320864261712654, + "learning_rate": 1.6765023089803645e-06, + "loss": 0.4971, + "step": 10552 + }, + { + "epoch": 0.82, + "grad_norm": 1.2018654410408118, + "learning_rate": 1.6751099219886314e-06, + "loss": 0.4749, + "step": 10553 + }, + { + "epoch": 0.82, + "grad_norm": 1.1420793536920715, + "learning_rate": 1.6737180605901592e-06, + "loss": 0.4877, + "step": 10554 + }, + { + "epoch": 0.82, + "grad_norm": 1.2102358354142642, + "learning_rate": 1.672326724872827e-06, + "loss": 0.4966, + "step": 10555 + }, + { + "epoch": 0.82, + "grad_norm": 1.1361701401783366, + "learning_rate": 1.6709359149244708e-06, + "loss": 0.4591, + "step": 10556 + }, + { + "epoch": 0.82, + "grad_norm": 1.195401406565811, + "learning_rate": 1.6695456308329027e-06, + "loss": 0.4591, + "step": 10557 + }, + { + "epoch": 0.82, + "grad_norm": 1.2573925288634176, + "learning_rate": 1.6681558726858983e-06, + "loss": 0.5205, + "step": 10558 + }, + { + "epoch": 0.82, + "grad_norm": 1.2768040721461484, + "learning_rate": 1.6667666405712002e-06, + "loss": 0.4884, + "step": 10559 + }, + { + "epoch": 0.82, + "grad_norm": 1.193325860135069, + "learning_rate": 1.6653779345765209e-06, + "loss": 0.5381, + "step": 10560 + }, + { + "epoch": 0.82, + "grad_norm": 1.0951323356961833, + "learning_rate": 1.6639897547895334e-06, + "loss": 0.4846, + "step": 10561 + }, + { + "epoch": 0.82, + "grad_norm": 1.2313968125547885, + "learning_rate": 1.662602101297879e-06, + "loss": 0.4823, + "step": 10562 + }, + { + "epoch": 0.82, + "grad_norm": 1.1681266856500292, + "learning_rate": 1.661214974189168e-06, + "loss": 0.4634, + "step": 10563 + }, + { + "epoch": 0.82, + "grad_norm": 1.2898688366470343, + "learning_rate": 1.6598283735509791e-06, + "loss": 0.5213, + "step": 10564 + }, + { + "epoch": 0.82, + "grad_norm": 1.2660283988159908, + "learning_rate": 1.6584422994708539e-06, + "loss": 0.5922, + "step": 10565 + }, + { + "epoch": 0.82, + "grad_norm": 1.2480377053653755, + "learning_rate": 1.6570567520363058e-06, + "loss": 0.5292, + "step": 10566 + }, + { + "epoch": 0.82, + "grad_norm": 1.1522369335258873, + "learning_rate": 1.6556717313348058e-06, + "loss": 0.4968, + "step": 10567 + }, + { + "epoch": 0.82, + "grad_norm": 1.2200183626262613, + "learning_rate": 1.654287237453801e-06, + "loss": 0.5661, + "step": 10568 + }, + { + "epoch": 0.82, + "grad_norm": 1.1331459343938992, + "learning_rate": 1.6529032704807012e-06, + "loss": 0.4938, + "step": 10569 + }, + { + "epoch": 0.82, + "grad_norm": 1.2208820669626403, + "learning_rate": 1.6515198305028868e-06, + "loss": 0.5376, + "step": 10570 + }, + { + "epoch": 0.82, + "grad_norm": 1.313192139642635, + "learning_rate": 1.6501369176076964e-06, + "loss": 0.5588, + "step": 10571 + }, + { + "epoch": 0.82, + "grad_norm": 1.126480452979602, + "learning_rate": 1.6487545318824405e-06, + "loss": 0.4844, + "step": 10572 + }, + { + "epoch": 0.82, + "grad_norm": 1.0932797102067502, + "learning_rate": 1.6473726734143969e-06, + "loss": 0.4108, + "step": 10573 + }, + { + "epoch": 0.82, + "grad_norm": 1.1992780937894623, + "learning_rate": 1.6459913422908113e-06, + "loss": 0.5026, + "step": 10574 + }, + { + "epoch": 0.82, + "grad_norm": 1.1143017826214296, + "learning_rate": 1.6446105385988932e-06, + "loss": 0.479, + "step": 10575 + }, + { + "epoch": 0.82, + "grad_norm": 1.0995762550691859, + "learning_rate": 1.6432302624258211e-06, + "loss": 0.4681, + "step": 10576 + }, + { + "epoch": 0.82, + "grad_norm": 1.1709815115963567, + "learning_rate": 1.641850513858737e-06, + "loss": 0.4255, + "step": 10577 + }, + { + "epoch": 0.82, + "grad_norm": 1.1894593387435193, + "learning_rate": 1.640471292984751e-06, + "loss": 0.5529, + "step": 10578 + }, + { + "epoch": 0.82, + "grad_norm": 1.1155630185302747, + "learning_rate": 1.6390925998909423e-06, + "loss": 0.4771, + "step": 10579 + }, + { + "epoch": 0.82, + "grad_norm": 1.2387560586944755, + "learning_rate": 1.637714434664357e-06, + "loss": 0.5498, + "step": 10580 + }, + { + "epoch": 0.82, + "grad_norm": 1.172648314589578, + "learning_rate": 1.6363367973920031e-06, + "loss": 0.4916, + "step": 10581 + }, + { + "epoch": 0.82, + "grad_norm": 1.1733047918258428, + "learning_rate": 1.6349596881608555e-06, + "loss": 0.5129, + "step": 10582 + }, + { + "epoch": 0.82, + "grad_norm": 1.0808998555280984, + "learning_rate": 1.6335831070578612e-06, + "loss": 0.4442, + "step": 10583 + }, + { + "epoch": 0.82, + "grad_norm": 1.420055189000459, + "learning_rate": 1.6322070541699298e-06, + "loss": 0.5632, + "step": 10584 + }, + { + "epoch": 0.82, + "grad_norm": 1.4057960625309385, + "learning_rate": 1.6308315295839395e-06, + "loss": 0.5387, + "step": 10585 + }, + { + "epoch": 0.82, + "grad_norm": 1.0657794493674142, + "learning_rate": 1.6294565333867362e-06, + "loss": 0.4544, + "step": 10586 + }, + { + "epoch": 0.82, + "grad_norm": 1.0917078299199416, + "learning_rate": 1.6280820656651252e-06, + "loss": 0.4467, + "step": 10587 + }, + { + "epoch": 0.82, + "grad_norm": 1.1970082639601753, + "learning_rate": 1.6267081265058882e-06, + "loss": 0.5093, + "step": 10588 + }, + { + "epoch": 0.82, + "grad_norm": 1.0844817249205754, + "learning_rate": 1.6253347159957666e-06, + "loss": 0.4324, + "step": 10589 + }, + { + "epoch": 0.82, + "grad_norm": 1.3469163273181914, + "learning_rate": 1.6239618342214746e-06, + "loss": 0.4984, + "step": 10590 + }, + { + "epoch": 0.82, + "grad_norm": 1.1597337563576289, + "learning_rate": 1.6225894812696875e-06, + "loss": 0.5273, + "step": 10591 + }, + { + "epoch": 0.82, + "grad_norm": 1.1174359345473404, + "learning_rate": 1.6212176572270445e-06, + "loss": 0.5066, + "step": 10592 + }, + { + "epoch": 0.82, + "grad_norm": 1.2679483728187548, + "learning_rate": 1.6198463621801607e-06, + "loss": 0.5363, + "step": 10593 + }, + { + "epoch": 0.82, + "grad_norm": 1.1787600376676683, + "learning_rate": 1.6184755962156118e-06, + "loss": 0.4586, + "step": 10594 + }, + { + "epoch": 0.82, + "grad_norm": 1.1610539107147957, + "learning_rate": 1.6171053594199403e-06, + "loss": 0.4825, + "step": 10595 + }, + { + "epoch": 0.82, + "grad_norm": 1.1591404536227137, + "learning_rate": 1.6157356518796608e-06, + "loss": 0.4964, + "step": 10596 + }, + { + "epoch": 0.82, + "grad_norm": 1.2739170786222964, + "learning_rate": 1.6143664736812449e-06, + "loss": 0.5388, + "step": 10597 + }, + { + "epoch": 0.82, + "grad_norm": 1.1487295596822786, + "learning_rate": 1.6129978249111366e-06, + "loss": 0.5308, + "step": 10598 + }, + { + "epoch": 0.82, + "grad_norm": 1.053988585426585, + "learning_rate": 1.6116297056557507e-06, + "loss": 0.4454, + "step": 10599 + }, + { + "epoch": 0.82, + "grad_norm": 1.140154493303633, + "learning_rate": 1.6102621160014563e-06, + "loss": 0.4721, + "step": 10600 + }, + { + "epoch": 0.82, + "grad_norm": 1.0979149130144294, + "learning_rate": 1.6088950560346017e-06, + "loss": 0.5351, + "step": 10601 + }, + { + "epoch": 0.82, + "grad_norm": 1.3602556029114805, + "learning_rate": 1.607528525841493e-06, + "loss": 0.5462, + "step": 10602 + }, + { + "epoch": 0.82, + "grad_norm": 1.2487420905304725, + "learning_rate": 1.6061625255084079e-06, + "loss": 0.4939, + "step": 10603 + }, + { + "epoch": 0.82, + "grad_norm": 1.264048925451849, + "learning_rate": 1.6047970551215898e-06, + "loss": 0.5182, + "step": 10604 + }, + { + "epoch": 0.82, + "grad_norm": 1.3149038735128054, + "learning_rate": 1.603432114767246e-06, + "loss": 0.5147, + "step": 10605 + }, + { + "epoch": 0.82, + "grad_norm": 1.1976032123405373, + "learning_rate": 1.6020677045315558e-06, + "loss": 0.4532, + "step": 10606 + }, + { + "epoch": 0.82, + "grad_norm": 1.2281628998399992, + "learning_rate": 1.600703824500658e-06, + "loss": 0.5406, + "step": 10607 + }, + { + "epoch": 0.82, + "grad_norm": 1.2557731824280873, + "learning_rate": 1.5993404747606612e-06, + "loss": 0.5092, + "step": 10608 + }, + { + "epoch": 0.82, + "grad_norm": 1.2202247597855143, + "learning_rate": 1.5979776553976444e-06, + "loss": 0.5009, + "step": 10609 + }, + { + "epoch": 0.82, + "grad_norm": 1.260890109886381, + "learning_rate": 1.5966153664976447e-06, + "loss": 0.5499, + "step": 10610 + }, + { + "epoch": 0.82, + "grad_norm": 1.1915798373477975, + "learning_rate": 1.5952536081466752e-06, + "loss": 0.4717, + "step": 10611 + }, + { + "epoch": 0.82, + "grad_norm": 1.2130895872219338, + "learning_rate": 1.593892380430706e-06, + "loss": 0.4822, + "step": 10612 + }, + { + "epoch": 0.82, + "grad_norm": 1.1936408427396823, + "learning_rate": 1.5925316834356797e-06, + "loss": 0.4967, + "step": 10613 + }, + { + "epoch": 0.82, + "grad_norm": 1.113920918118061, + "learning_rate": 1.5911715172475062e-06, + "loss": 0.4447, + "step": 10614 + }, + { + "epoch": 0.82, + "grad_norm": 1.354397397069576, + "learning_rate": 1.589811881952058e-06, + "loss": 0.5498, + "step": 10615 + }, + { + "epoch": 0.82, + "grad_norm": 1.1929171128205, + "learning_rate": 1.5884527776351765e-06, + "loss": 0.4722, + "step": 10616 + }, + { + "epoch": 0.82, + "grad_norm": 1.1839754603297938, + "learning_rate": 1.587094204382672e-06, + "loss": 0.5038, + "step": 10617 + }, + { + "epoch": 0.82, + "grad_norm": 1.1277828236536438, + "learning_rate": 1.5857361622803124e-06, + "loss": 0.4634, + "step": 10618 + }, + { + "epoch": 0.82, + "grad_norm": 1.2086732430136258, + "learning_rate": 1.5843786514138438e-06, + "loss": 0.5452, + "step": 10619 + }, + { + "epoch": 0.82, + "grad_norm": 1.243875041024383, + "learning_rate": 1.5830216718689674e-06, + "loss": 0.4856, + "step": 10620 + }, + { + "epoch": 0.82, + "grad_norm": 1.1107088586063996, + "learning_rate": 1.5816652237313579e-06, + "loss": 0.4601, + "step": 10621 + }, + { + "epoch": 0.82, + "grad_norm": 1.141746753241139, + "learning_rate": 1.5803093070866582e-06, + "loss": 0.4985, + "step": 10622 + }, + { + "epoch": 0.82, + "grad_norm": 1.138993494274794, + "learning_rate": 1.5789539220204698e-06, + "loss": 0.4817, + "step": 10623 + }, + { + "epoch": 0.82, + "grad_norm": 1.2651400637830874, + "learning_rate": 1.5775990686183672e-06, + "loss": 0.5021, + "step": 10624 + }, + { + "epoch": 0.82, + "grad_norm": 1.303035626902938, + "learning_rate": 1.5762447469658892e-06, + "loss": 0.5787, + "step": 10625 + }, + { + "epoch": 0.82, + "grad_norm": 1.141180399666159, + "learning_rate": 1.5748909571485415e-06, + "loss": 0.4397, + "step": 10626 + }, + { + "epoch": 0.82, + "grad_norm": 1.2237553637877017, + "learning_rate": 1.573537699251797e-06, + "loss": 0.4826, + "step": 10627 + }, + { + "epoch": 0.82, + "grad_norm": 1.1775421947362295, + "learning_rate": 1.5721849733610905e-06, + "loss": 0.5236, + "step": 10628 + }, + { + "epoch": 0.82, + "grad_norm": 1.0834415210597546, + "learning_rate": 1.5708327795618317e-06, + "loss": 0.4447, + "step": 10629 + }, + { + "epoch": 0.82, + "grad_norm": 1.165043001895713, + "learning_rate": 1.5694811179393853e-06, + "loss": 0.5152, + "step": 10630 + }, + { + "epoch": 0.82, + "grad_norm": 1.2859387551877466, + "learning_rate": 1.5681299885790912e-06, + "loss": 0.4722, + "step": 10631 + }, + { + "epoch": 0.82, + "grad_norm": 1.2709186191811892, + "learning_rate": 1.5667793915662566e-06, + "loss": 0.5463, + "step": 10632 + }, + { + "epoch": 0.82, + "grad_norm": 1.161564803553512, + "learning_rate": 1.5654293269861464e-06, + "loss": 0.4952, + "step": 10633 + }, + { + "epoch": 0.82, + "grad_norm": 1.295246480586887, + "learning_rate": 1.5640797949239983e-06, + "loss": 0.5319, + "step": 10634 + }, + { + "epoch": 0.83, + "grad_norm": 1.2390392883358687, + "learning_rate": 1.5627307954650174e-06, + "loss": 0.4864, + "step": 10635 + }, + { + "epoch": 0.83, + "grad_norm": 1.1896794045976744, + "learning_rate": 1.5613823286943718e-06, + "loss": 0.5396, + "step": 10636 + }, + { + "epoch": 0.83, + "grad_norm": 1.2407145374187092, + "learning_rate": 1.5600343946971997e-06, + "loss": 0.542, + "step": 10637 + }, + { + "epoch": 0.83, + "grad_norm": 1.1586477831679753, + "learning_rate": 1.558686993558599e-06, + "loss": 0.493, + "step": 10638 + }, + { + "epoch": 0.83, + "grad_norm": 1.1317715828137198, + "learning_rate": 1.557340125363641e-06, + "loss": 0.4916, + "step": 10639 + }, + { + "epoch": 0.83, + "grad_norm": 1.250607247672225, + "learning_rate": 1.5559937901973575e-06, + "loss": 0.5091, + "step": 10640 + }, + { + "epoch": 0.83, + "grad_norm": 1.2454636275081175, + "learning_rate": 1.554647988144752e-06, + "loss": 0.4796, + "step": 10641 + }, + { + "epoch": 0.83, + "grad_norm": 1.1786074715424588, + "learning_rate": 1.5533027192907924e-06, + "loss": 0.5, + "step": 10642 + }, + { + "epoch": 0.83, + "grad_norm": 1.3496931963670329, + "learning_rate": 1.5519579837204103e-06, + "loss": 0.515, + "step": 10643 + }, + { + "epoch": 0.83, + "grad_norm": 1.155420314967855, + "learning_rate": 1.5506137815185063e-06, + "loss": 0.4891, + "step": 10644 + }, + { + "epoch": 0.83, + "grad_norm": 1.1867319434049721, + "learning_rate": 1.5492701127699472e-06, + "loss": 0.4803, + "step": 10645 + }, + { + "epoch": 0.83, + "grad_norm": 1.0859875564564818, + "learning_rate": 1.5479269775595652e-06, + "loss": 0.4627, + "step": 10646 + }, + { + "epoch": 0.83, + "grad_norm": 1.3417575169639346, + "learning_rate": 1.5465843759721633e-06, + "loss": 0.4812, + "step": 10647 + }, + { + "epoch": 0.83, + "grad_norm": 1.0905783307165362, + "learning_rate": 1.5452423080925017e-06, + "loss": 0.4835, + "step": 10648 + }, + { + "epoch": 0.83, + "grad_norm": 1.1912132732257752, + "learning_rate": 1.5439007740053158e-06, + "loss": 0.5001, + "step": 10649 + }, + { + "epoch": 0.83, + "grad_norm": 1.1660853481692872, + "learning_rate": 1.542559773795299e-06, + "loss": 0.5076, + "step": 10650 + }, + { + "epoch": 0.83, + "grad_norm": 1.2455843658633547, + "learning_rate": 1.5412193075471193e-06, + "loss": 0.5132, + "step": 10651 + }, + { + "epoch": 0.83, + "grad_norm": 1.2122231826191785, + "learning_rate": 1.5398793753454079e-06, + "loss": 0.5049, + "step": 10652 + }, + { + "epoch": 0.83, + "grad_norm": 1.2730598591847297, + "learning_rate": 1.5385399772747578e-06, + "loss": 0.5382, + "step": 10653 + }, + { + "epoch": 0.83, + "grad_norm": 1.2040212191394304, + "learning_rate": 1.537201113419735e-06, + "loss": 0.5187, + "step": 10654 + }, + { + "epoch": 0.83, + "grad_norm": 1.2002762595706624, + "learning_rate": 1.5358627838648676e-06, + "loss": 0.5191, + "step": 10655 + }, + { + "epoch": 0.83, + "grad_norm": 1.1650171141943202, + "learning_rate": 1.5345249886946512e-06, + "loss": 0.5007, + "step": 10656 + }, + { + "epoch": 0.83, + "grad_norm": 1.1814721862289355, + "learning_rate": 1.5331877279935515e-06, + "loss": 0.5262, + "step": 10657 + }, + { + "epoch": 0.83, + "grad_norm": 1.287374744063012, + "learning_rate": 1.5318510018459908e-06, + "loss": 0.5571, + "step": 10658 + }, + { + "epoch": 0.83, + "grad_norm": 1.0429881804867251, + "learning_rate": 1.5305148103363698e-06, + "loss": 0.4691, + "step": 10659 + }, + { + "epoch": 0.83, + "grad_norm": 1.2026368178732705, + "learning_rate": 1.5291791535490419e-06, + "loss": 0.5112, + "step": 10660 + }, + { + "epoch": 0.83, + "grad_norm": 1.1618844463750302, + "learning_rate": 1.5278440315683385e-06, + "loss": 0.4974, + "step": 10661 + }, + { + "epoch": 0.83, + "grad_norm": 1.279191247204793, + "learning_rate": 1.5265094444785544e-06, + "loss": 0.5216, + "step": 10662 + }, + { + "epoch": 0.83, + "grad_norm": 1.2477094167179268, + "learning_rate": 1.5251753923639435e-06, + "loss": 0.535, + "step": 10663 + }, + { + "epoch": 0.83, + "grad_norm": 1.1094570398217969, + "learning_rate": 1.5238418753087347e-06, + "loss": 0.4709, + "step": 10664 + }, + { + "epoch": 0.83, + "grad_norm": 1.150298602272318, + "learning_rate": 1.52250889339712e-06, + "loss": 0.5148, + "step": 10665 + }, + { + "epoch": 0.83, + "grad_norm": 1.2391737841051351, + "learning_rate": 1.5211764467132562e-06, + "loss": 0.5111, + "step": 10666 + }, + { + "epoch": 0.83, + "grad_norm": 1.2618902698063414, + "learning_rate": 1.5198445353412705e-06, + "loss": 0.5339, + "step": 10667 + }, + { + "epoch": 0.83, + "grad_norm": 1.2754243668069645, + "learning_rate": 1.5185131593652492e-06, + "loss": 0.5474, + "step": 10668 + }, + { + "epoch": 0.83, + "grad_norm": 1.1565161733720979, + "learning_rate": 1.5171823188692537e-06, + "loss": 0.4839, + "step": 10669 + }, + { + "epoch": 0.83, + "grad_norm": 1.2418106755116487, + "learning_rate": 1.515852013937301e-06, + "loss": 0.51, + "step": 10670 + }, + { + "epoch": 0.83, + "grad_norm": 1.1223652504326402, + "learning_rate": 1.5145222446533835e-06, + "loss": 0.5, + "step": 10671 + }, + { + "epoch": 0.83, + "grad_norm": 1.2221621163838106, + "learning_rate": 1.5131930111014558e-06, + "loss": 0.4724, + "step": 10672 + }, + { + "epoch": 0.83, + "grad_norm": 1.2141082107009922, + "learning_rate": 1.5118643133654421e-06, + "loss": 0.544, + "step": 10673 + }, + { + "epoch": 0.83, + "grad_norm": 1.2580655714118167, + "learning_rate": 1.5105361515292248e-06, + "loss": 0.4804, + "step": 10674 + }, + { + "epoch": 0.83, + "grad_norm": 1.2292671270319728, + "learning_rate": 1.5092085256766597e-06, + "loss": 0.5168, + "step": 10675 + }, + { + "epoch": 0.83, + "grad_norm": 1.0648835877924034, + "learning_rate": 1.5078814358915673e-06, + "loss": 0.4423, + "step": 10676 + }, + { + "epoch": 0.83, + "grad_norm": 1.1668641343348982, + "learning_rate": 1.5065548822577336e-06, + "loss": 0.4548, + "step": 10677 + }, + { + "epoch": 0.83, + "grad_norm": 1.1516953535320928, + "learning_rate": 1.5052288648589131e-06, + "loss": 0.4911, + "step": 10678 + }, + { + "epoch": 0.83, + "grad_norm": 1.219413625554132, + "learning_rate": 1.503903383778822e-06, + "loss": 0.5264, + "step": 10679 + }, + { + "epoch": 0.83, + "grad_norm": 1.2380641903451444, + "learning_rate": 1.502578439101141e-06, + "loss": 0.5702, + "step": 10680 + }, + { + "epoch": 0.83, + "grad_norm": 1.1643114271843449, + "learning_rate": 1.5012540309095247e-06, + "loss": 0.4866, + "step": 10681 + }, + { + "epoch": 0.83, + "grad_norm": 1.199663160653029, + "learning_rate": 1.4999301592875891e-06, + "loss": 0.529, + "step": 10682 + }, + { + "epoch": 0.83, + "grad_norm": 1.1609746441861055, + "learning_rate": 1.4986068243189188e-06, + "loss": 0.4953, + "step": 10683 + }, + { + "epoch": 0.83, + "grad_norm": 1.2624824031698814, + "learning_rate": 1.4972840260870603e-06, + "loss": 0.5146, + "step": 10684 + }, + { + "epoch": 0.83, + "grad_norm": 1.2543424519040083, + "learning_rate": 1.4959617646755276e-06, + "loss": 0.5037, + "step": 10685 + }, + { + "epoch": 0.83, + "grad_norm": 1.2434565940861557, + "learning_rate": 1.494640040167805e-06, + "loss": 0.5153, + "step": 10686 + }, + { + "epoch": 0.83, + "grad_norm": 1.2467298647897496, + "learning_rate": 1.4933188526473385e-06, + "loss": 0.4857, + "step": 10687 + }, + { + "epoch": 0.83, + "grad_norm": 1.161222334122599, + "learning_rate": 1.4919982021975432e-06, + "loss": 0.4957, + "step": 10688 + }, + { + "epoch": 0.83, + "grad_norm": 1.101285588515358, + "learning_rate": 1.490678088901798e-06, + "loss": 0.467, + "step": 10689 + }, + { + "epoch": 0.83, + "grad_norm": 1.1081466859437221, + "learning_rate": 1.489358512843444e-06, + "loss": 0.472, + "step": 10690 + }, + { + "epoch": 0.83, + "grad_norm": 1.142412900797478, + "learning_rate": 1.4880394741057968e-06, + "loss": 0.4944, + "step": 10691 + }, + { + "epoch": 0.83, + "grad_norm": 1.178900854301871, + "learning_rate": 1.4867209727721332e-06, + "loss": 0.4587, + "step": 10692 + }, + { + "epoch": 0.83, + "grad_norm": 1.22633194426983, + "learning_rate": 1.4854030089257e-06, + "loss": 0.5165, + "step": 10693 + }, + { + "epoch": 0.83, + "grad_norm": 1.154534123497548, + "learning_rate": 1.4840855826497013e-06, + "loss": 0.5071, + "step": 10694 + }, + { + "epoch": 0.83, + "grad_norm": 1.2420594731218504, + "learning_rate": 1.4827686940273178e-06, + "loss": 0.4821, + "step": 10695 + }, + { + "epoch": 0.83, + "grad_norm": 1.2937649583182145, + "learning_rate": 1.4814523431416882e-06, + "loss": 0.5263, + "step": 10696 + }, + { + "epoch": 0.83, + "grad_norm": 1.2260895807480467, + "learning_rate": 1.4801365300759253e-06, + "loss": 0.4687, + "step": 10697 + }, + { + "epoch": 0.83, + "grad_norm": 1.3067974914450806, + "learning_rate": 1.4788212549130964e-06, + "loss": 0.5591, + "step": 10698 + }, + { + "epoch": 0.83, + "grad_norm": 1.3168350562730655, + "learning_rate": 1.4775065177362492e-06, + "loss": 0.5635, + "step": 10699 + }, + { + "epoch": 0.83, + "grad_norm": 1.1561035888352202, + "learning_rate": 1.4761923186283822e-06, + "loss": 0.4237, + "step": 10700 + }, + { + "epoch": 0.83, + "grad_norm": 1.1750204470052563, + "learning_rate": 1.4748786576724716e-06, + "loss": 0.4973, + "step": 10701 + }, + { + "epoch": 0.83, + "grad_norm": 1.3664606832716801, + "learning_rate": 1.4735655349514555e-06, + "loss": 0.5479, + "step": 10702 + }, + { + "epoch": 0.83, + "grad_norm": 1.28754793837104, + "learning_rate": 1.4722529505482396e-06, + "loss": 0.5688, + "step": 10703 + }, + { + "epoch": 0.83, + "grad_norm": 1.3797582312281547, + "learning_rate": 1.47094090454569e-06, + "loss": 0.5157, + "step": 10704 + }, + { + "epoch": 0.83, + "grad_norm": 1.2582705119958655, + "learning_rate": 1.4696293970266463e-06, + "loss": 0.5462, + "step": 10705 + }, + { + "epoch": 0.83, + "grad_norm": 1.1306954431388179, + "learning_rate": 1.4683184280739082e-06, + "loss": 0.4743, + "step": 10706 + }, + { + "epoch": 0.83, + "grad_norm": 1.2099052705805882, + "learning_rate": 1.467007997770249e-06, + "loss": 0.493, + "step": 10707 + }, + { + "epoch": 0.83, + "grad_norm": 1.1512392602309827, + "learning_rate": 1.4656981061983966e-06, + "loss": 0.5054, + "step": 10708 + }, + { + "epoch": 0.83, + "grad_norm": 1.2088871979984055, + "learning_rate": 1.464388753441056e-06, + "loss": 0.5698, + "step": 10709 + }, + { + "epoch": 0.83, + "grad_norm": 1.2783778959379315, + "learning_rate": 1.4630799395808893e-06, + "loss": 0.577, + "step": 10710 + }, + { + "epoch": 0.83, + "grad_norm": 1.1232985239455786, + "learning_rate": 1.4617716647005298e-06, + "loss": 0.4855, + "step": 10711 + }, + { + "epoch": 0.83, + "grad_norm": 1.2508608238169414, + "learning_rate": 1.4604639288825773e-06, + "loss": 0.4672, + "step": 10712 + }, + { + "epoch": 0.83, + "grad_norm": 1.1214602200395811, + "learning_rate": 1.4591567322095978e-06, + "loss": 0.4392, + "step": 10713 + }, + { + "epoch": 0.83, + "grad_norm": 1.1259987424467393, + "learning_rate": 1.4578500747641167e-06, + "loss": 0.5118, + "step": 10714 + }, + { + "epoch": 0.83, + "grad_norm": 1.2225311020247238, + "learning_rate": 1.456543956628631e-06, + "loss": 0.4707, + "step": 10715 + }, + { + "epoch": 0.83, + "grad_norm": 1.2100539398781158, + "learning_rate": 1.455238377885605e-06, + "loss": 0.4967, + "step": 10716 + }, + { + "epoch": 0.83, + "grad_norm": 1.137771224080233, + "learning_rate": 1.453933338617467e-06, + "loss": 0.5059, + "step": 10717 + }, + { + "epoch": 0.83, + "grad_norm": 1.2057733493341938, + "learning_rate": 1.4526288389066068e-06, + "loss": 0.4884, + "step": 10718 + }, + { + "epoch": 0.83, + "grad_norm": 1.2189737750292187, + "learning_rate": 1.4513248788353894e-06, + "loss": 0.4715, + "step": 10719 + }, + { + "epoch": 0.83, + "grad_norm": 1.1831893985590594, + "learning_rate": 1.4500214584861349e-06, + "loss": 0.5097, + "step": 10720 + }, + { + "epoch": 0.83, + "grad_norm": 1.23139942637299, + "learning_rate": 1.4487185779411382e-06, + "loss": 0.4887, + "step": 10721 + }, + { + "epoch": 0.83, + "grad_norm": 1.3156255563476547, + "learning_rate": 1.4474162372826562e-06, + "loss": 0.5304, + "step": 10722 + }, + { + "epoch": 0.83, + "grad_norm": 1.2479036872880824, + "learning_rate": 1.446114436592916e-06, + "loss": 0.5598, + "step": 10723 + }, + { + "epoch": 0.83, + "grad_norm": 1.1843007811791368, + "learning_rate": 1.4448131759540996e-06, + "loss": 0.5057, + "step": 10724 + }, + { + "epoch": 0.83, + "grad_norm": 1.2374767012281316, + "learning_rate": 1.4435124554483671e-06, + "loss": 0.519, + "step": 10725 + }, + { + "epoch": 0.83, + "grad_norm": 1.240530624451376, + "learning_rate": 1.4422122751578394e-06, + "loss": 0.5137, + "step": 10726 + }, + { + "epoch": 0.83, + "grad_norm": 1.2339787390406027, + "learning_rate": 1.4409126351646052e-06, + "loss": 0.5416, + "step": 10727 + }, + { + "epoch": 0.83, + "grad_norm": 1.2254550575192762, + "learning_rate": 1.4396135355507135e-06, + "loss": 0.5091, + "step": 10728 + }, + { + "epoch": 0.83, + "grad_norm": 1.1919957437951312, + "learning_rate": 1.4383149763981863e-06, + "loss": 0.5135, + "step": 10729 + }, + { + "epoch": 0.83, + "grad_norm": 1.1402930207704394, + "learning_rate": 1.4370169577890059e-06, + "loss": 0.5007, + "step": 10730 + }, + { + "epoch": 0.83, + "grad_norm": 1.2649005653292626, + "learning_rate": 1.4357194798051244e-06, + "loss": 0.5077, + "step": 10731 + }, + { + "epoch": 0.83, + "grad_norm": 1.2354540876582532, + "learning_rate": 1.4344225425284565e-06, + "loss": 0.5043, + "step": 10732 + }, + { + "epoch": 0.83, + "grad_norm": 1.1919058330770496, + "learning_rate": 1.4331261460408874e-06, + "loss": 0.485, + "step": 10733 + }, + { + "epoch": 0.83, + "grad_norm": 1.2504249803994862, + "learning_rate": 1.4318302904242654e-06, + "loss": 0.4858, + "step": 10734 + }, + { + "epoch": 0.83, + "grad_norm": 1.1635475299702271, + "learning_rate": 1.4305349757604014e-06, + "loss": 0.5002, + "step": 10735 + }, + { + "epoch": 0.83, + "grad_norm": 1.2957456336237656, + "learning_rate": 1.429240202131077e-06, + "loss": 0.4974, + "step": 10736 + }, + { + "epoch": 0.83, + "grad_norm": 1.11815931827255, + "learning_rate": 1.4279459696180398e-06, + "loss": 0.4841, + "step": 10737 + }, + { + "epoch": 0.83, + "grad_norm": 1.3639096011608889, + "learning_rate": 1.4266522783029978e-06, + "loss": 0.5187, + "step": 10738 + }, + { + "epoch": 0.83, + "grad_norm": 1.2202920694631334, + "learning_rate": 1.4253591282676316e-06, + "loss": 0.5205, + "step": 10739 + }, + { + "epoch": 0.83, + "grad_norm": 1.1667114430872203, + "learning_rate": 1.424066519593581e-06, + "loss": 0.5079, + "step": 10740 + }, + { + "epoch": 0.83, + "grad_norm": 1.1430497198830127, + "learning_rate": 1.4227744523624575e-06, + "loss": 0.4168, + "step": 10741 + }, + { + "epoch": 0.83, + "grad_norm": 1.3526177228508531, + "learning_rate": 1.421482926655835e-06, + "loss": 0.5473, + "step": 10742 + }, + { + "epoch": 0.83, + "grad_norm": 1.2059125932393542, + "learning_rate": 1.4201919425552557e-06, + "loss": 0.478, + "step": 10743 + }, + { + "epoch": 0.83, + "grad_norm": 1.1257928597502185, + "learning_rate": 1.4189015001422257e-06, + "loss": 0.4847, + "step": 10744 + }, + { + "epoch": 0.83, + "grad_norm": 1.2357940725639691, + "learning_rate": 1.4176115994982153e-06, + "loss": 0.4943, + "step": 10745 + }, + { + "epoch": 0.83, + "grad_norm": 1.134541737306114, + "learning_rate": 1.416322240704664e-06, + "loss": 0.4747, + "step": 10746 + }, + { + "epoch": 0.83, + "grad_norm": 1.2373541601537699, + "learning_rate": 1.4150334238429776e-06, + "loss": 0.5243, + "step": 10747 + }, + { + "epoch": 0.83, + "grad_norm": 1.1749621608403984, + "learning_rate": 1.413745148994522e-06, + "loss": 0.4926, + "step": 10748 + }, + { + "epoch": 0.83, + "grad_norm": 1.211961030626305, + "learning_rate": 1.4124574162406356e-06, + "loss": 0.5169, + "step": 10749 + }, + { + "epoch": 0.83, + "grad_norm": 1.1651775984402475, + "learning_rate": 1.411170225662617e-06, + "loss": 0.4731, + "step": 10750 + }, + { + "epoch": 0.83, + "grad_norm": 1.2407771327141737, + "learning_rate": 1.409883577341734e-06, + "loss": 0.5455, + "step": 10751 + }, + { + "epoch": 0.83, + "grad_norm": 1.365969654997289, + "learning_rate": 1.4085974713592199e-06, + "loss": 0.5209, + "step": 10752 + }, + { + "epoch": 0.83, + "grad_norm": 1.2404837770581318, + "learning_rate": 1.4073119077962738e-06, + "loss": 0.5274, + "step": 10753 + }, + { + "epoch": 0.83, + "grad_norm": 1.0377283350613595, + "learning_rate": 1.4060268867340621e-06, + "loss": 0.4271, + "step": 10754 + }, + { + "epoch": 0.83, + "grad_norm": 1.1906535788482806, + "learning_rate": 1.4047424082537086e-06, + "loss": 0.5017, + "step": 10755 + }, + { + "epoch": 0.83, + "grad_norm": 1.2003521859407196, + "learning_rate": 1.4034584724363131e-06, + "loss": 0.5402, + "step": 10756 + }, + { + "epoch": 0.83, + "grad_norm": 1.1219848387776727, + "learning_rate": 1.4021750793629397e-06, + "loss": 0.4741, + "step": 10757 + }, + { + "epoch": 0.83, + "grad_norm": 1.196548222087757, + "learning_rate": 1.4008922291146087e-06, + "loss": 0.4337, + "step": 10758 + }, + { + "epoch": 0.83, + "grad_norm": 1.1567767721771123, + "learning_rate": 1.3996099217723202e-06, + "loss": 0.4881, + "step": 10759 + }, + { + "epoch": 0.83, + "grad_norm": 1.1268658529831788, + "learning_rate": 1.3983281574170271e-06, + "loss": 0.4501, + "step": 10760 + }, + { + "epoch": 0.83, + "grad_norm": 1.2337646423986006, + "learning_rate": 1.3970469361296557e-06, + "loss": 0.4722, + "step": 10761 + }, + { + "epoch": 0.83, + "grad_norm": 1.251927130037995, + "learning_rate": 1.3957662579910969e-06, + "loss": 0.4932, + "step": 10762 + }, + { + "epoch": 0.83, + "grad_norm": 1.2588248118317495, + "learning_rate": 1.394486123082206e-06, + "loss": 0.4983, + "step": 10763 + }, + { + "epoch": 0.84, + "grad_norm": 1.3571947964431077, + "learning_rate": 1.3932065314838071e-06, + "loss": 0.5309, + "step": 10764 + }, + { + "epoch": 0.84, + "grad_norm": 1.1875720253233086, + "learning_rate": 1.3919274832766838e-06, + "loss": 0.4871, + "step": 10765 + }, + { + "epoch": 0.84, + "grad_norm": 1.2049814244180048, + "learning_rate": 1.390648978541589e-06, + "loss": 0.489, + "step": 10766 + }, + { + "epoch": 0.84, + "grad_norm": 1.1682337329563806, + "learning_rate": 1.3893710173592457e-06, + "loss": 0.5027, + "step": 10767 + }, + { + "epoch": 0.84, + "grad_norm": 1.2058632146528905, + "learning_rate": 1.3880935998103317e-06, + "loss": 0.5019, + "step": 10768 + }, + { + "epoch": 0.84, + "grad_norm": 1.1302106244932704, + "learning_rate": 1.386816725975504e-06, + "loss": 0.4566, + "step": 10769 + }, + { + "epoch": 0.84, + "grad_norm": 1.2500268456437313, + "learning_rate": 1.385540395935372e-06, + "loss": 0.5164, + "step": 10770 + }, + { + "epoch": 0.84, + "grad_norm": 1.2638334144509422, + "learning_rate": 1.3842646097705193e-06, + "loss": 0.4984, + "step": 10771 + }, + { + "epoch": 0.84, + "grad_norm": 1.2933552328611988, + "learning_rate": 1.3829893675614924e-06, + "loss": 0.4955, + "step": 10772 + }, + { + "epoch": 0.84, + "grad_norm": 1.0794999573898838, + "learning_rate": 1.381714669388805e-06, + "loss": 0.471, + "step": 10773 + }, + { + "epoch": 0.84, + "grad_norm": 1.0502251497201327, + "learning_rate": 1.380440515332938e-06, + "loss": 0.4248, + "step": 10774 + }, + { + "epoch": 0.84, + "grad_norm": 1.2664682852007916, + "learning_rate": 1.3791669054743295e-06, + "loss": 0.5356, + "step": 10775 + }, + { + "epoch": 0.84, + "grad_norm": 1.2468282991999549, + "learning_rate": 1.3778938398933927e-06, + "loss": 0.5251, + "step": 10776 + }, + { + "epoch": 0.84, + "grad_norm": 1.1621070156554307, + "learning_rate": 1.3766213186705036e-06, + "loss": 0.4653, + "step": 10777 + }, + { + "epoch": 0.84, + "grad_norm": 1.294201496602907, + "learning_rate": 1.3753493418859987e-06, + "loss": 0.5818, + "step": 10778 + }, + { + "epoch": 0.84, + "grad_norm": 1.2030049549010255, + "learning_rate": 1.3740779096201883e-06, + "loss": 0.5253, + "step": 10779 + }, + { + "epoch": 0.84, + "grad_norm": 1.2743483953986934, + "learning_rate": 1.372807021953345e-06, + "loss": 0.5098, + "step": 10780 + }, + { + "epoch": 0.84, + "grad_norm": 1.2291839188911362, + "learning_rate": 1.3715366789657025e-06, + "loss": 0.5083, + "step": 10781 + }, + { + "epoch": 0.84, + "grad_norm": 1.2275395480620062, + "learning_rate": 1.3702668807374664e-06, + "loss": 0.5127, + "step": 10782 + }, + { + "epoch": 0.84, + "grad_norm": 1.235526212024401, + "learning_rate": 1.368997627348806e-06, + "loss": 0.5532, + "step": 10783 + }, + { + "epoch": 0.84, + "grad_norm": 1.149829151067626, + "learning_rate": 1.367728918879856e-06, + "loss": 0.4838, + "step": 10784 + }, + { + "epoch": 0.84, + "grad_norm": 1.2707991621499186, + "learning_rate": 1.3664607554107178e-06, + "loss": 0.5214, + "step": 10785 + }, + { + "epoch": 0.84, + "grad_norm": 1.1669629719800076, + "learning_rate": 1.3651931370214533e-06, + "loss": 0.4473, + "step": 10786 + }, + { + "epoch": 0.84, + "grad_norm": 1.1423184613295985, + "learning_rate": 1.3639260637920971e-06, + "loss": 0.4627, + "step": 10787 + }, + { + "epoch": 0.84, + "grad_norm": 1.1849397364716565, + "learning_rate": 1.3626595358026429e-06, + "loss": 0.4989, + "step": 10788 + }, + { + "epoch": 0.84, + "grad_norm": 1.3661531295707572, + "learning_rate": 1.3613935531330558e-06, + "loss": 0.5729, + "step": 10789 + }, + { + "epoch": 0.84, + "grad_norm": 1.1020821502236913, + "learning_rate": 1.360128115863265e-06, + "loss": 0.4742, + "step": 10790 + }, + { + "epoch": 0.84, + "grad_norm": 1.198002599315339, + "learning_rate": 1.3588632240731591e-06, + "loss": 0.531, + "step": 10791 + }, + { + "epoch": 0.84, + "grad_norm": 1.2487884849292366, + "learning_rate": 1.3575988778426008e-06, + "loss": 0.4717, + "step": 10792 + }, + { + "epoch": 0.84, + "grad_norm": 1.2853957303823997, + "learning_rate": 1.356335077251415e-06, + "loss": 0.5587, + "step": 10793 + }, + { + "epoch": 0.84, + "grad_norm": 1.16087185659976, + "learning_rate": 1.35507182237939e-06, + "loss": 0.4719, + "step": 10794 + }, + { + "epoch": 0.84, + "grad_norm": 1.0618740369625115, + "learning_rate": 1.3538091133062858e-06, + "loss": 0.4866, + "step": 10795 + }, + { + "epoch": 0.84, + "grad_norm": 1.4409372450580755, + "learning_rate": 1.3525469501118183e-06, + "loss": 0.5358, + "step": 10796 + }, + { + "epoch": 0.84, + "grad_norm": 1.2967474541507673, + "learning_rate": 1.3512853328756792e-06, + "loss": 0.5178, + "step": 10797 + }, + { + "epoch": 0.84, + "grad_norm": 1.1611695151467718, + "learning_rate": 1.350024261677516e-06, + "loss": 0.4664, + "step": 10798 + }, + { + "epoch": 0.84, + "grad_norm": 1.1657886380037057, + "learning_rate": 1.3487637365969497e-06, + "loss": 0.5048, + "step": 10799 + }, + { + "epoch": 0.84, + "grad_norm": 1.3451269768405352, + "learning_rate": 1.347503757713564e-06, + "loss": 0.5801, + "step": 10800 + }, + { + "epoch": 0.84, + "grad_norm": 1.1820919467548117, + "learning_rate": 1.3462443251069069e-06, + "loss": 0.4986, + "step": 10801 + }, + { + "epoch": 0.84, + "grad_norm": 1.175646904148628, + "learning_rate": 1.344985438856492e-06, + "loss": 0.4639, + "step": 10802 + }, + { + "epoch": 0.84, + "grad_norm": 1.2158919347785935, + "learning_rate": 1.3437270990418005e-06, + "loss": 0.4836, + "step": 10803 + }, + { + "epoch": 0.84, + "grad_norm": 1.1412074548451747, + "learning_rate": 1.3424693057422778e-06, + "loss": 0.4691, + "step": 10804 + }, + { + "epoch": 0.84, + "grad_norm": 1.2210774328468081, + "learning_rate": 1.3412120590373368e-06, + "loss": 0.4906, + "step": 10805 + }, + { + "epoch": 0.84, + "grad_norm": 1.3015930575187398, + "learning_rate": 1.3399553590063496e-06, + "loss": 0.4965, + "step": 10806 + }, + { + "epoch": 0.84, + "grad_norm": 1.2948082691375216, + "learning_rate": 1.338699205728663e-06, + "loss": 0.5724, + "step": 10807 + }, + { + "epoch": 0.84, + "grad_norm": 1.1802022455142054, + "learning_rate": 1.3374435992835798e-06, + "loss": 0.4926, + "step": 10808 + }, + { + "epoch": 0.84, + "grad_norm": 1.1665733958608424, + "learning_rate": 1.3361885397503749e-06, + "loss": 0.5317, + "step": 10809 + }, + { + "epoch": 0.84, + "grad_norm": 1.2625637000603989, + "learning_rate": 1.334934027208289e-06, + "loss": 0.5471, + "step": 10810 + }, + { + "epoch": 0.84, + "grad_norm": 1.3277707244485306, + "learning_rate": 1.333680061736522e-06, + "loss": 0.5281, + "step": 10811 + }, + { + "epoch": 0.84, + "grad_norm": 1.2786571970924778, + "learning_rate": 1.3324266434142452e-06, + "loss": 0.5368, + "step": 10812 + }, + { + "epoch": 0.84, + "grad_norm": 1.3105343677361685, + "learning_rate": 1.331173772320593e-06, + "loss": 0.5353, + "step": 10813 + }, + { + "epoch": 0.84, + "grad_norm": 1.213867531221668, + "learning_rate": 1.3299214485346657e-06, + "loss": 0.536, + "step": 10814 + }, + { + "epoch": 0.84, + "grad_norm": 1.1527844636919835, + "learning_rate": 1.3286696721355308e-06, + "loss": 0.5245, + "step": 10815 + }, + { + "epoch": 0.84, + "grad_norm": 1.3641731820313083, + "learning_rate": 1.3274184432022163e-06, + "loss": 0.5001, + "step": 10816 + }, + { + "epoch": 0.84, + "grad_norm": 1.1386973670998044, + "learning_rate": 1.3261677618137225e-06, + "loss": 0.5058, + "step": 10817 + }, + { + "epoch": 0.84, + "grad_norm": 1.208461765610928, + "learning_rate": 1.3249176280490062e-06, + "loss": 0.5306, + "step": 10818 + }, + { + "epoch": 0.84, + "grad_norm": 1.19888057351629, + "learning_rate": 1.3236680419869974e-06, + "loss": 0.5333, + "step": 10819 + }, + { + "epoch": 0.84, + "grad_norm": 1.17742789410155, + "learning_rate": 1.322419003706592e-06, + "loss": 0.5125, + "step": 10820 + }, + { + "epoch": 0.84, + "grad_norm": 1.4860746591124063, + "learning_rate": 1.3211705132866425e-06, + "loss": 0.5186, + "step": 10821 + }, + { + "epoch": 0.84, + "grad_norm": 1.2032560301930184, + "learning_rate": 1.319922570805976e-06, + "loss": 0.4705, + "step": 10822 + }, + { + "epoch": 0.84, + "grad_norm": 1.0833945379329721, + "learning_rate": 1.3186751763433803e-06, + "loss": 0.4589, + "step": 10823 + }, + { + "epoch": 0.84, + "grad_norm": 1.1397373455706485, + "learning_rate": 1.3174283299776103e-06, + "loss": 0.4907, + "step": 10824 + }, + { + "epoch": 0.84, + "grad_norm": 1.2937090272911513, + "learning_rate": 1.3161820317873886e-06, + "loss": 0.5254, + "step": 10825 + }, + { + "epoch": 0.84, + "grad_norm": 1.2711673933288596, + "learning_rate": 1.3149362818513955e-06, + "loss": 0.4953, + "step": 10826 + }, + { + "epoch": 0.84, + "grad_norm": 1.2915378731946339, + "learning_rate": 1.3136910802482862e-06, + "loss": 0.5293, + "step": 10827 + }, + { + "epoch": 0.84, + "grad_norm": 1.238818656720174, + "learning_rate": 1.3124464270566727e-06, + "loss": 0.4607, + "step": 10828 + }, + { + "epoch": 0.84, + "grad_norm": 1.2065017180606452, + "learning_rate": 1.3112023223551374e-06, + "loss": 0.5077, + "step": 10829 + }, + { + "epoch": 0.84, + "grad_norm": 1.1675112029034365, + "learning_rate": 1.3099587662222302e-06, + "loss": 0.4655, + "step": 10830 + }, + { + "epoch": 0.84, + "grad_norm": 1.1661638073077676, + "learning_rate": 1.3087157587364596e-06, + "loss": 0.4585, + "step": 10831 + }, + { + "epoch": 0.84, + "grad_norm": 1.175343530230575, + "learning_rate": 1.3074732999763029e-06, + "loss": 0.5753, + "step": 10832 + }, + { + "epoch": 0.84, + "grad_norm": 1.1335883279114707, + "learning_rate": 1.306231390020205e-06, + "loss": 0.4821, + "step": 10833 + }, + { + "epoch": 0.84, + "grad_norm": 1.2057703833679223, + "learning_rate": 1.3049900289465733e-06, + "loss": 0.5463, + "step": 10834 + }, + { + "epoch": 0.84, + "grad_norm": 1.0682319411687036, + "learning_rate": 1.303749216833784e-06, + "loss": 0.456, + "step": 10835 + }, + { + "epoch": 0.84, + "grad_norm": 1.247203846640096, + "learning_rate": 1.30250895376017e-06, + "loss": 0.5179, + "step": 10836 + }, + { + "epoch": 0.84, + "grad_norm": 1.1588513777910028, + "learning_rate": 1.3012692398040416e-06, + "loss": 0.4801, + "step": 10837 + }, + { + "epoch": 0.84, + "grad_norm": 1.2390458787762963, + "learning_rate": 1.3000300750436645e-06, + "loss": 0.573, + "step": 10838 + }, + { + "epoch": 0.84, + "grad_norm": 1.1628762671955695, + "learning_rate": 1.2987914595572738e-06, + "loss": 0.5235, + "step": 10839 + }, + { + "epoch": 0.84, + "grad_norm": 1.2102574069137875, + "learning_rate": 1.297553393423071e-06, + "loss": 0.5248, + "step": 10840 + }, + { + "epoch": 0.84, + "grad_norm": 1.1891242513775562, + "learning_rate": 1.296315876719223e-06, + "loss": 0.499, + "step": 10841 + }, + { + "epoch": 0.84, + "grad_norm": 1.3099195909210037, + "learning_rate": 1.295078909523857e-06, + "loss": 0.5159, + "step": 10842 + }, + { + "epoch": 0.84, + "grad_norm": 1.1284373012258375, + "learning_rate": 1.2938424919150705e-06, + "loss": 0.5088, + "step": 10843 + }, + { + "epoch": 0.84, + "grad_norm": 1.188858610259817, + "learning_rate": 1.2926066239709256e-06, + "loss": 0.5223, + "step": 10844 + }, + { + "epoch": 0.84, + "grad_norm": 1.190536481715684, + "learning_rate": 1.2913713057694498e-06, + "loss": 0.5015, + "step": 10845 + }, + { + "epoch": 0.84, + "grad_norm": 1.2773474363687967, + "learning_rate": 1.2901365373886331e-06, + "loss": 0.5389, + "step": 10846 + }, + { + "epoch": 0.84, + "grad_norm": 1.2536104988387609, + "learning_rate": 1.2889023189064364e-06, + "loss": 0.5022, + "step": 10847 + }, + { + "epoch": 0.84, + "grad_norm": 1.208849429486005, + "learning_rate": 1.2876686504007764e-06, + "loss": 0.4989, + "step": 10848 + }, + { + "epoch": 0.84, + "grad_norm": 1.1443225032280204, + "learning_rate": 1.2864355319495448e-06, + "loss": 0.4371, + "step": 10849 + }, + { + "epoch": 0.84, + "grad_norm": 1.1924101055498921, + "learning_rate": 1.285202963630594e-06, + "loss": 0.5226, + "step": 10850 + }, + { + "epoch": 0.84, + "grad_norm": 1.2944129645776967, + "learning_rate": 1.2839709455217453e-06, + "loss": 0.5451, + "step": 10851 + }, + { + "epoch": 0.84, + "grad_norm": 1.2238177063148723, + "learning_rate": 1.2827394777007774e-06, + "loss": 0.5202, + "step": 10852 + }, + { + "epoch": 0.84, + "grad_norm": 1.1125773027972965, + "learning_rate": 1.2815085602454401e-06, + "loss": 0.4273, + "step": 10853 + }, + { + "epoch": 0.84, + "grad_norm": 1.2329779819679016, + "learning_rate": 1.2802781932334495e-06, + "loss": 0.5011, + "step": 10854 + }, + { + "epoch": 0.84, + "grad_norm": 1.157375715598917, + "learning_rate": 1.2790483767424878e-06, + "loss": 0.4952, + "step": 10855 + }, + { + "epoch": 0.84, + "grad_norm": 1.310867293140858, + "learning_rate": 1.2778191108501925e-06, + "loss": 0.5292, + "step": 10856 + }, + { + "epoch": 0.84, + "grad_norm": 1.193135142135876, + "learning_rate": 1.2765903956341807e-06, + "loss": 0.4734, + "step": 10857 + }, + { + "epoch": 0.84, + "grad_norm": 1.15281770948524, + "learning_rate": 1.2753622311720203e-06, + "loss": 0.4928, + "step": 10858 + }, + { + "epoch": 0.84, + "grad_norm": 1.2165344837097434, + "learning_rate": 1.2741346175412571e-06, + "loss": 0.4891, + "step": 10859 + }, + { + "epoch": 0.84, + "grad_norm": 1.3436418977768463, + "learning_rate": 1.2729075548193947e-06, + "loss": 0.6186, + "step": 10860 + }, + { + "epoch": 0.84, + "grad_norm": 1.1532568982784446, + "learning_rate": 1.2716810430839056e-06, + "loss": 0.5101, + "step": 10861 + }, + { + "epoch": 0.84, + "grad_norm": 1.2762731329215957, + "learning_rate": 1.270455082412223e-06, + "loss": 0.5155, + "step": 10862 + }, + { + "epoch": 0.84, + "grad_norm": 1.1895692264413007, + "learning_rate": 1.2692296728817487e-06, + "loss": 0.5011, + "step": 10863 + }, + { + "epoch": 0.84, + "grad_norm": 1.2093423489049155, + "learning_rate": 1.268004814569851e-06, + "loss": 0.5906, + "step": 10864 + }, + { + "epoch": 0.84, + "grad_norm": 1.1630280269679925, + "learning_rate": 1.266780507553863e-06, + "loss": 0.4933, + "step": 10865 + }, + { + "epoch": 0.84, + "grad_norm": 1.017427107428367, + "learning_rate": 1.2655567519110756e-06, + "loss": 0.4223, + "step": 10866 + }, + { + "epoch": 0.84, + "grad_norm": 1.1516943184550408, + "learning_rate": 1.2643335477187567e-06, + "loss": 0.5277, + "step": 10867 + }, + { + "epoch": 0.84, + "grad_norm": 1.1864273346336496, + "learning_rate": 1.2631108950541303e-06, + "loss": 0.4524, + "step": 10868 + }, + { + "epoch": 0.84, + "grad_norm": 1.1194909630905223, + "learning_rate": 1.261888793994388e-06, + "loss": 0.5079, + "step": 10869 + }, + { + "epoch": 0.84, + "grad_norm": 1.1862825376753299, + "learning_rate": 1.260667244616689e-06, + "loss": 0.4726, + "step": 10870 + }, + { + "epoch": 0.84, + "grad_norm": 1.122631546842427, + "learning_rate": 1.2594462469981582e-06, + "loss": 0.4809, + "step": 10871 + }, + { + "epoch": 0.84, + "grad_norm": 1.098077659216121, + "learning_rate": 1.2582258012158799e-06, + "loss": 0.4574, + "step": 10872 + }, + { + "epoch": 0.84, + "grad_norm": 1.1962322112607366, + "learning_rate": 1.2570059073469076e-06, + "loss": 0.4667, + "step": 10873 + }, + { + "epoch": 0.84, + "grad_norm": 1.2463609175838648, + "learning_rate": 1.25578656546826e-06, + "loss": 0.5413, + "step": 10874 + }, + { + "epoch": 0.84, + "grad_norm": 1.2412484896401919, + "learning_rate": 1.254567775656923e-06, + "loss": 0.4892, + "step": 10875 + }, + { + "epoch": 0.84, + "grad_norm": 1.1532129145658276, + "learning_rate": 1.2533495379898407e-06, + "loss": 0.4851, + "step": 10876 + }, + { + "epoch": 0.84, + "grad_norm": 1.1250474707866498, + "learning_rate": 1.252131852543932e-06, + "loss": 0.471, + "step": 10877 + }, + { + "epoch": 0.84, + "grad_norm": 1.241355425380615, + "learning_rate": 1.250914719396069e-06, + "loss": 0.5385, + "step": 10878 + }, + { + "epoch": 0.84, + "grad_norm": 1.1449373846132629, + "learning_rate": 1.2496981386231e-06, + "loss": 0.472, + "step": 10879 + }, + { + "epoch": 0.84, + "grad_norm": 1.2023882901652603, + "learning_rate": 1.2484821103018329e-06, + "loss": 0.5339, + "step": 10880 + }, + { + "epoch": 0.84, + "grad_norm": 1.168978914496552, + "learning_rate": 1.2472666345090435e-06, + "loss": 0.4872, + "step": 10881 + }, + { + "epoch": 0.84, + "grad_norm": 1.106973278005702, + "learning_rate": 1.2460517113214688e-06, + "loss": 0.4774, + "step": 10882 + }, + { + "epoch": 0.84, + "grad_norm": 1.0929780143034311, + "learning_rate": 1.2448373408158133e-06, + "loss": 0.4846, + "step": 10883 + }, + { + "epoch": 0.84, + "grad_norm": 1.2446006511296048, + "learning_rate": 1.2436235230687466e-06, + "loss": 0.5218, + "step": 10884 + }, + { + "epoch": 0.84, + "grad_norm": 1.138021712080953, + "learning_rate": 1.2424102581569064e-06, + "loss": 0.4933, + "step": 10885 + }, + { + "epoch": 0.84, + "grad_norm": 1.1251463794844778, + "learning_rate": 1.2411975461568881e-06, + "loss": 0.493, + "step": 10886 + }, + { + "epoch": 0.84, + "grad_norm": 1.2287749242728667, + "learning_rate": 1.2399853871452605e-06, + "loss": 0.514, + "step": 10887 + }, + { + "epoch": 0.84, + "grad_norm": 1.206398955719468, + "learning_rate": 1.2387737811985479e-06, + "loss": 0.4976, + "step": 10888 + }, + { + "epoch": 0.84, + "grad_norm": 1.208941580611243, + "learning_rate": 1.23756272839325e-06, + "loss": 0.5029, + "step": 10889 + }, + { + "epoch": 0.84, + "grad_norm": 1.255789984748273, + "learning_rate": 1.2363522288058238e-06, + "loss": 0.5212, + "step": 10890 + }, + { + "epoch": 0.84, + "grad_norm": 1.258270038292831, + "learning_rate": 1.2351422825126969e-06, + "loss": 0.4995, + "step": 10891 + }, + { + "epoch": 0.84, + "grad_norm": 1.2292192200024226, + "learning_rate": 1.2339328895902603e-06, + "loss": 0.5207, + "step": 10892 + }, + { + "epoch": 0.85, + "grad_norm": 1.1894136870161656, + "learning_rate": 1.2327240501148651e-06, + "loss": 0.4845, + "step": 10893 + }, + { + "epoch": 0.85, + "grad_norm": 1.2650543621881953, + "learning_rate": 1.2315157641628338e-06, + "loss": 0.5372, + "step": 10894 + }, + { + "epoch": 0.85, + "grad_norm": 1.198015684375401, + "learning_rate": 1.2303080318104533e-06, + "loss": 0.4852, + "step": 10895 + }, + { + "epoch": 0.85, + "grad_norm": 1.1911042878256393, + "learning_rate": 1.229100853133971e-06, + "loss": 0.4934, + "step": 10896 + }, + { + "epoch": 0.85, + "grad_norm": 1.2255815118352613, + "learning_rate": 1.2278942282096063e-06, + "loss": 0.4829, + "step": 10897 + }, + { + "epoch": 0.85, + "grad_norm": 1.2046207196716854, + "learning_rate": 1.2266881571135337e-06, + "loss": 0.5593, + "step": 10898 + }, + { + "epoch": 0.85, + "grad_norm": 1.1641985410524214, + "learning_rate": 1.2254826399219032e-06, + "loss": 0.473, + "step": 10899 + }, + { + "epoch": 0.85, + "grad_norm": 1.309079527007699, + "learning_rate": 1.2242776767108233e-06, + "loss": 0.5477, + "step": 10900 + }, + { + "epoch": 0.85, + "grad_norm": 1.2956451191423466, + "learning_rate": 1.2230732675563705e-06, + "loss": 0.5157, + "step": 10901 + }, + { + "epoch": 0.85, + "grad_norm": 1.318197147136756, + "learning_rate": 1.2218694125345887e-06, + "loss": 0.5097, + "step": 10902 + }, + { + "epoch": 0.85, + "grad_norm": 1.2899736363282295, + "learning_rate": 1.2206661117214768e-06, + "loss": 0.5074, + "step": 10903 + }, + { + "epoch": 0.85, + "grad_norm": 1.1875261002734072, + "learning_rate": 1.2194633651930089e-06, + "loss": 0.447, + "step": 10904 + }, + { + "epoch": 0.85, + "grad_norm": 1.1686532556812843, + "learning_rate": 1.2182611730251225e-06, + "loss": 0.4576, + "step": 10905 + }, + { + "epoch": 0.85, + "grad_norm": 1.1182941211206503, + "learning_rate": 1.2170595352937142e-06, + "loss": 0.4488, + "step": 10906 + }, + { + "epoch": 0.85, + "grad_norm": 1.26196277254576, + "learning_rate": 1.215858452074653e-06, + "loss": 0.4595, + "step": 10907 + }, + { + "epoch": 0.85, + "grad_norm": 1.26661469181406, + "learning_rate": 1.214657923443766e-06, + "loss": 0.5371, + "step": 10908 + }, + { + "epoch": 0.85, + "grad_norm": 1.1408964382818523, + "learning_rate": 1.2134579494768507e-06, + "loss": 0.4682, + "step": 10909 + }, + { + "epoch": 0.85, + "grad_norm": 1.1716665972731792, + "learning_rate": 1.2122585302496682e-06, + "loss": 0.4829, + "step": 10910 + }, + { + "epoch": 0.85, + "grad_norm": 1.2747453397855322, + "learning_rate": 1.2110596658379426e-06, + "loss": 0.5306, + "step": 10911 + }, + { + "epoch": 0.85, + "grad_norm": 1.2727065065318452, + "learning_rate": 1.2098613563173678e-06, + "loss": 0.5048, + "step": 10912 + }, + { + "epoch": 0.85, + "grad_norm": 1.0808364385963682, + "learning_rate": 1.2086636017635955e-06, + "loss": 0.4559, + "step": 10913 + }, + { + "epoch": 0.85, + "grad_norm": 1.1184747389258711, + "learning_rate": 1.2074664022522464e-06, + "loss": 0.4905, + "step": 10914 + }, + { + "epoch": 0.85, + "grad_norm": 1.1918389707099555, + "learning_rate": 1.2062697578589089e-06, + "loss": 0.4776, + "step": 10915 + }, + { + "epoch": 0.85, + "grad_norm": 1.1282726476531821, + "learning_rate": 1.2050736686591292e-06, + "loss": 0.4776, + "step": 10916 + }, + { + "epoch": 0.85, + "grad_norm": 1.1985809916668981, + "learning_rate": 1.2038781347284266e-06, + "loss": 0.4855, + "step": 10917 + }, + { + "epoch": 0.85, + "grad_norm": 1.2888363726557173, + "learning_rate": 1.202683156142278e-06, + "loss": 0.5117, + "step": 10918 + }, + { + "epoch": 0.85, + "grad_norm": 1.3553973352269555, + "learning_rate": 1.2014887329761293e-06, + "loss": 0.4993, + "step": 10919 + }, + { + "epoch": 0.85, + "grad_norm": 1.2015193777692599, + "learning_rate": 1.2002948653053915e-06, + "loss": 0.5126, + "step": 10920 + }, + { + "epoch": 0.85, + "grad_norm": 1.2364997925378338, + "learning_rate": 1.1991015532054395e-06, + "loss": 0.5, + "step": 10921 + }, + { + "epoch": 0.85, + "grad_norm": 1.1539568973240086, + "learning_rate": 1.1979087967516146e-06, + "loss": 0.4963, + "step": 10922 + }, + { + "epoch": 0.85, + "grad_norm": 1.2573862241992435, + "learning_rate": 1.1967165960192185e-06, + "loss": 0.5441, + "step": 10923 + }, + { + "epoch": 0.85, + "grad_norm": 1.2326866385415258, + "learning_rate": 1.1955249510835232e-06, + "loss": 0.4765, + "step": 10924 + }, + { + "epoch": 0.85, + "grad_norm": 1.2356714128606008, + "learning_rate": 1.1943338620197642e-06, + "loss": 0.4759, + "step": 10925 + }, + { + "epoch": 0.85, + "grad_norm": 1.3008147455891728, + "learning_rate": 1.193143328903138e-06, + "loss": 0.484, + "step": 10926 + }, + { + "epoch": 0.85, + "grad_norm": 1.2964257070996899, + "learning_rate": 1.1919533518088121e-06, + "loss": 0.5137, + "step": 10927 + }, + { + "epoch": 0.85, + "grad_norm": 1.148413054536943, + "learning_rate": 1.1907639308119134e-06, + "loss": 0.4726, + "step": 10928 + }, + { + "epoch": 0.85, + "grad_norm": 1.2314015561466816, + "learning_rate": 1.1895750659875372e-06, + "loss": 0.5329, + "step": 10929 + }, + { + "epoch": 0.85, + "grad_norm": 1.2561606228287285, + "learning_rate": 1.1883867574107433e-06, + "loss": 0.5213, + "step": 10930 + }, + { + "epoch": 0.85, + "grad_norm": 1.160572632330765, + "learning_rate": 1.1871990051565551e-06, + "loss": 0.4701, + "step": 10931 + }, + { + "epoch": 0.85, + "grad_norm": 1.1548002794497303, + "learning_rate": 1.186011809299964e-06, + "loss": 0.535, + "step": 10932 + }, + { + "epoch": 0.85, + "grad_norm": 1.091084146994098, + "learning_rate": 1.1848251699159185e-06, + "loss": 0.437, + "step": 10933 + }, + { + "epoch": 0.85, + "grad_norm": 1.3319150364987997, + "learning_rate": 1.1836390870793414e-06, + "loss": 0.5169, + "step": 10934 + }, + { + "epoch": 0.85, + "grad_norm": 1.301156158947863, + "learning_rate": 1.1824535608651177e-06, + "loss": 0.5143, + "step": 10935 + }, + { + "epoch": 0.85, + "grad_norm": 1.2165011173561442, + "learning_rate": 1.1812685913480904e-06, + "loss": 0.5389, + "step": 10936 + }, + { + "epoch": 0.85, + "grad_norm": 1.2067531524579094, + "learning_rate": 1.180084178603077e-06, + "loss": 0.4974, + "step": 10937 + }, + { + "epoch": 0.85, + "grad_norm": 1.1960559103952126, + "learning_rate": 1.1789003227048533e-06, + "loss": 0.4589, + "step": 10938 + }, + { + "epoch": 0.85, + "grad_norm": 1.2521339797674484, + "learning_rate": 1.1777170237281633e-06, + "loss": 0.5686, + "step": 10939 + }, + { + "epoch": 0.85, + "grad_norm": 1.2981094092735477, + "learning_rate": 1.1765342817477133e-06, + "loss": 0.5404, + "step": 10940 + }, + { + "epoch": 0.85, + "grad_norm": 1.216255912727184, + "learning_rate": 1.1753520968381782e-06, + "loss": 0.4981, + "step": 10941 + }, + { + "epoch": 0.85, + "grad_norm": 1.1948853770552794, + "learning_rate": 1.1741704690741961e-06, + "loss": 0.5024, + "step": 10942 + }, + { + "epoch": 0.85, + "grad_norm": 1.2560423722135725, + "learning_rate": 1.1729893985303653e-06, + "loss": 0.537, + "step": 10943 + }, + { + "epoch": 0.85, + "grad_norm": 1.2590150948332368, + "learning_rate": 1.171808885281256e-06, + "loss": 0.501, + "step": 10944 + }, + { + "epoch": 0.85, + "grad_norm": 1.240003236874078, + "learning_rate": 1.1706289294014005e-06, + "loss": 0.5355, + "step": 10945 + }, + { + "epoch": 0.85, + "grad_norm": 1.139649012809811, + "learning_rate": 1.169449530965292e-06, + "loss": 0.4487, + "step": 10946 + }, + { + "epoch": 0.85, + "grad_norm": 1.2459014457191502, + "learning_rate": 1.168270690047395e-06, + "loss": 0.5409, + "step": 10947 + }, + { + "epoch": 0.85, + "grad_norm": 1.178618799636493, + "learning_rate": 1.1670924067221367e-06, + "loss": 0.499, + "step": 10948 + }, + { + "epoch": 0.85, + "grad_norm": 1.146812431021523, + "learning_rate": 1.1659146810639043e-06, + "loss": 0.5009, + "step": 10949 + }, + { + "epoch": 0.85, + "grad_norm": 1.2949706193544122, + "learning_rate": 1.1647375131470562e-06, + "loss": 0.4855, + "step": 10950 + }, + { + "epoch": 0.85, + "grad_norm": 1.2314065901425824, + "learning_rate": 1.1635609030459127e-06, + "loss": 0.532, + "step": 10951 + }, + { + "epoch": 0.85, + "grad_norm": 1.1257868769953112, + "learning_rate": 1.1623848508347603e-06, + "loss": 0.4624, + "step": 10952 + }, + { + "epoch": 0.85, + "grad_norm": 1.134018880468286, + "learning_rate": 1.1612093565878502e-06, + "loss": 0.5205, + "step": 10953 + }, + { + "epoch": 0.85, + "grad_norm": 1.3056232956746885, + "learning_rate": 1.1600344203793922e-06, + "loss": 0.585, + "step": 10954 + }, + { + "epoch": 0.85, + "grad_norm": 1.0941822287873288, + "learning_rate": 1.1588600422835728e-06, + "loss": 0.4613, + "step": 10955 + }, + { + "epoch": 0.85, + "grad_norm": 1.3033279370702722, + "learning_rate": 1.157686222374531e-06, + "loss": 0.5306, + "step": 10956 + }, + { + "epoch": 0.85, + "grad_norm": 1.1631717730284945, + "learning_rate": 1.156512960726378e-06, + "loss": 0.5017, + "step": 10957 + }, + { + "epoch": 0.85, + "grad_norm": 1.1042728492975822, + "learning_rate": 1.15534025741319e-06, + "loss": 0.5094, + "step": 10958 + }, + { + "epoch": 0.85, + "grad_norm": 1.1033958167470623, + "learning_rate": 1.1541681125090031e-06, + "loss": 0.4616, + "step": 10959 + }, + { + "epoch": 0.85, + "grad_norm": 1.2065354597474753, + "learning_rate": 1.1529965260878207e-06, + "loss": 0.5443, + "step": 10960 + }, + { + "epoch": 0.85, + "grad_norm": 1.2799111791430533, + "learning_rate": 1.1518254982236121e-06, + "loss": 0.512, + "step": 10961 + }, + { + "epoch": 0.85, + "grad_norm": 1.2412995337584116, + "learning_rate": 1.1506550289903107e-06, + "loss": 0.5096, + "step": 10962 + }, + { + "epoch": 0.85, + "grad_norm": 1.1974353763289312, + "learning_rate": 1.149485118461816e-06, + "loss": 0.4898, + "step": 10963 + }, + { + "epoch": 0.85, + "grad_norm": 1.1782457750173052, + "learning_rate": 1.148315766711986e-06, + "loss": 0.4912, + "step": 10964 + }, + { + "epoch": 0.85, + "grad_norm": 1.2835463437911176, + "learning_rate": 1.1471469738146534e-06, + "loss": 0.5307, + "step": 10965 + }, + { + "epoch": 0.85, + "grad_norm": 1.1431393541005277, + "learning_rate": 1.145978739843604e-06, + "loss": 0.4886, + "step": 10966 + }, + { + "epoch": 0.85, + "grad_norm": 1.3437348741966413, + "learning_rate": 1.1448110648725974e-06, + "loss": 0.4795, + "step": 10967 + }, + { + "epoch": 0.85, + "grad_norm": 1.1581387429572234, + "learning_rate": 1.1436439489753581e-06, + "loss": 0.4731, + "step": 10968 + }, + { + "epoch": 0.85, + "grad_norm": 1.1961365894541909, + "learning_rate": 1.1424773922255662e-06, + "loss": 0.488, + "step": 10969 + }, + { + "epoch": 0.85, + "grad_norm": 1.3464965695813065, + "learning_rate": 1.1413113946968756e-06, + "loss": 0.5224, + "step": 10970 + }, + { + "epoch": 0.85, + "grad_norm": 1.2091670229591636, + "learning_rate": 1.1401459564629013e-06, + "loss": 0.5662, + "step": 10971 + }, + { + "epoch": 0.85, + "grad_norm": 1.235892316954457, + "learning_rate": 1.1389810775972244e-06, + "loss": 0.5629, + "step": 10972 + }, + { + "epoch": 0.85, + "grad_norm": 1.2497525447049591, + "learning_rate": 1.1378167581733901e-06, + "loss": 0.5027, + "step": 10973 + }, + { + "epoch": 0.85, + "grad_norm": 1.2061069245551066, + "learning_rate": 1.1366529982649043e-06, + "loss": 0.5051, + "step": 10974 + }, + { + "epoch": 0.85, + "grad_norm": 1.2388547898823448, + "learning_rate": 1.1354897979452472e-06, + "loss": 0.5195, + "step": 10975 + }, + { + "epoch": 0.85, + "grad_norm": 1.1407296642202176, + "learning_rate": 1.13432715728785e-06, + "loss": 0.4769, + "step": 10976 + }, + { + "epoch": 0.85, + "grad_norm": 1.212526570845022, + "learning_rate": 1.1331650763661217e-06, + "loss": 0.5651, + "step": 10977 + }, + { + "epoch": 0.85, + "grad_norm": 1.2109669343385894, + "learning_rate": 1.1320035552534304e-06, + "loss": 0.5194, + "step": 10978 + }, + { + "epoch": 0.85, + "grad_norm": 1.152942876725815, + "learning_rate": 1.130842594023106e-06, + "loss": 0.452, + "step": 10979 + }, + { + "epoch": 0.85, + "grad_norm": 1.0769316108334046, + "learning_rate": 1.129682192748447e-06, + "loss": 0.5106, + "step": 10980 + }, + { + "epoch": 0.85, + "grad_norm": 1.2568610721615041, + "learning_rate": 1.1285223515027155e-06, + "loss": 0.5194, + "step": 10981 + }, + { + "epoch": 0.85, + "grad_norm": 1.2569610841655474, + "learning_rate": 1.12736307035914e-06, + "loss": 0.5669, + "step": 10982 + }, + { + "epoch": 0.85, + "grad_norm": 1.2924073720682197, + "learning_rate": 1.1262043493909113e-06, + "loss": 0.5731, + "step": 10983 + }, + { + "epoch": 0.85, + "grad_norm": 1.1220376430765475, + "learning_rate": 1.125046188671184e-06, + "loss": 0.5025, + "step": 10984 + }, + { + "epoch": 0.85, + "grad_norm": 1.255659689682778, + "learning_rate": 1.123888588273081e-06, + "loss": 0.5235, + "step": 10985 + }, + { + "epoch": 0.85, + "grad_norm": 1.2200607196362419, + "learning_rate": 1.1227315482696844e-06, + "loss": 0.4696, + "step": 10986 + }, + { + "epoch": 0.85, + "grad_norm": 1.2064723724003361, + "learning_rate": 1.1215750687340455e-06, + "loss": 0.4811, + "step": 10987 + }, + { + "epoch": 0.85, + "grad_norm": 1.1083721947609209, + "learning_rate": 1.1204191497391815e-06, + "loss": 0.4939, + "step": 10988 + }, + { + "epoch": 0.85, + "grad_norm": 1.2110291477100763, + "learning_rate": 1.119263791358066e-06, + "loss": 0.5515, + "step": 10989 + }, + { + "epoch": 0.85, + "grad_norm": 1.2576024608694756, + "learning_rate": 1.118108993663647e-06, + "loss": 0.546, + "step": 10990 + }, + { + "epoch": 0.85, + "grad_norm": 1.1645289772919507, + "learning_rate": 1.1169547567288319e-06, + "loss": 0.4856, + "step": 10991 + }, + { + "epoch": 0.85, + "grad_norm": 1.0279983349723543, + "learning_rate": 1.115801080626493e-06, + "loss": 0.4487, + "step": 10992 + }, + { + "epoch": 0.85, + "grad_norm": 1.210358265810039, + "learning_rate": 1.1146479654294706e-06, + "loss": 0.5128, + "step": 10993 + }, + { + "epoch": 0.85, + "grad_norm": 1.2835304620781385, + "learning_rate": 1.1134954112105645e-06, + "loss": 0.5464, + "step": 10994 + }, + { + "epoch": 0.85, + "grad_norm": 1.2349588968796155, + "learning_rate": 1.1123434180425396e-06, + "loss": 0.494, + "step": 10995 + }, + { + "epoch": 0.85, + "grad_norm": 1.2615550971410479, + "learning_rate": 1.1111919859981291e-06, + "loss": 0.5084, + "step": 10996 + }, + { + "epoch": 0.85, + "grad_norm": 1.3699009589433255, + "learning_rate": 1.1100411151500279e-06, + "loss": 0.5049, + "step": 10997 + }, + { + "epoch": 0.85, + "grad_norm": 1.1878724267099376, + "learning_rate": 1.1088908055709003e-06, + "loss": 0.564, + "step": 10998 + }, + { + "epoch": 0.85, + "grad_norm": 1.2212558807106515, + "learning_rate": 1.1077410573333659e-06, + "loss": 0.5244, + "step": 10999 + }, + { + "epoch": 0.85, + "grad_norm": 1.1960268067924829, + "learning_rate": 1.1065918705100164e-06, + "loss": 0.4708, + "step": 11000 + }, + { + "epoch": 0.85, + "grad_norm": 1.236982464702292, + "learning_rate": 1.1054432451734053e-06, + "loss": 0.5139, + "step": 11001 + }, + { + "epoch": 0.85, + "grad_norm": 1.2934956471031154, + "learning_rate": 1.1042951813960535e-06, + "loss": 0.499, + "step": 11002 + }, + { + "epoch": 0.85, + "grad_norm": 1.185914738251953, + "learning_rate": 1.1031476792504436e-06, + "loss": 0.4702, + "step": 11003 + }, + { + "epoch": 0.85, + "grad_norm": 1.3610791015537416, + "learning_rate": 1.1020007388090227e-06, + "loss": 0.5566, + "step": 11004 + }, + { + "epoch": 0.85, + "grad_norm": 1.1509418300409455, + "learning_rate": 1.1008543601442012e-06, + "loss": 0.4898, + "step": 11005 + }, + { + "epoch": 0.85, + "grad_norm": 1.0690610488752403, + "learning_rate": 1.0997085433283572e-06, + "loss": 0.4547, + "step": 11006 + }, + { + "epoch": 0.85, + "grad_norm": 1.172408224227052, + "learning_rate": 1.0985632884338327e-06, + "loss": 0.5034, + "step": 11007 + }, + { + "epoch": 0.85, + "grad_norm": 1.152837149815908, + "learning_rate": 1.097418595532933e-06, + "loss": 0.5033, + "step": 11008 + }, + { + "epoch": 0.85, + "grad_norm": 1.18503761980242, + "learning_rate": 1.0962744646979296e-06, + "loss": 0.5057, + "step": 11009 + }, + { + "epoch": 0.85, + "grad_norm": 1.3533746114708962, + "learning_rate": 1.0951308960010554e-06, + "loss": 0.5416, + "step": 11010 + }, + { + "epoch": 0.85, + "grad_norm": 1.1421855545549224, + "learning_rate": 1.0939878895145107e-06, + "loss": 0.4833, + "step": 11011 + }, + { + "epoch": 0.85, + "grad_norm": 1.2718850250223959, + "learning_rate": 1.0928454453104597e-06, + "loss": 0.5475, + "step": 11012 + }, + { + "epoch": 0.85, + "grad_norm": 1.1922615858184182, + "learning_rate": 1.0917035634610328e-06, + "loss": 0.5016, + "step": 11013 + }, + { + "epoch": 0.85, + "grad_norm": 1.230251571623262, + "learning_rate": 1.09056224403832e-06, + "loss": 0.5077, + "step": 11014 + }, + { + "epoch": 0.85, + "grad_norm": 1.129394795128725, + "learning_rate": 1.0894214871143783e-06, + "loss": 0.4975, + "step": 11015 + }, + { + "epoch": 0.85, + "grad_norm": 1.1328901066998234, + "learning_rate": 1.0882812927612297e-06, + "loss": 0.4733, + "step": 11016 + }, + { + "epoch": 0.85, + "grad_norm": 1.2175706390160521, + "learning_rate": 1.0871416610508622e-06, + "loss": 0.5217, + "step": 11017 + }, + { + "epoch": 0.85, + "grad_norm": 1.2137954457473987, + "learning_rate": 1.0860025920552252e-06, + "loss": 0.4689, + "step": 11018 + }, + { + "epoch": 0.85, + "grad_norm": 1.2838029780316345, + "learning_rate": 1.0848640858462378e-06, + "loss": 0.5279, + "step": 11019 + }, + { + "epoch": 0.85, + "grad_norm": 1.1396354668017237, + "learning_rate": 1.0837261424957735e-06, + "loss": 0.4426, + "step": 11020 + }, + { + "epoch": 0.85, + "grad_norm": 1.2505302258315805, + "learning_rate": 1.0825887620756804e-06, + "loss": 0.5382, + "step": 11021 + }, + { + "epoch": 0.86, + "grad_norm": 1.226496360599236, + "learning_rate": 1.0814519446577665e-06, + "loss": 0.5071, + "step": 11022 + }, + { + "epoch": 0.86, + "grad_norm": 1.2339909112895453, + "learning_rate": 1.0803156903138069e-06, + "loss": 0.5321, + "step": 11023 + }, + { + "epoch": 0.86, + "grad_norm": 1.1292351372335896, + "learning_rate": 1.0791799991155371e-06, + "loss": 0.4608, + "step": 11024 + }, + { + "epoch": 0.86, + "grad_norm": 1.178304910456379, + "learning_rate": 1.078044871134658e-06, + "loss": 0.4722, + "step": 11025 + }, + { + "epoch": 0.86, + "grad_norm": 1.0947160541666423, + "learning_rate": 1.0769103064428366e-06, + "loss": 0.4624, + "step": 11026 + }, + { + "epoch": 0.86, + "grad_norm": 1.1775900276254219, + "learning_rate": 1.0757763051117055e-06, + "loss": 0.5176, + "step": 11027 + }, + { + "epoch": 0.86, + "grad_norm": 1.2413996471231832, + "learning_rate": 1.0746428672128583e-06, + "loss": 0.4981, + "step": 11028 + }, + { + "epoch": 0.86, + "grad_norm": 1.1791926476589687, + "learning_rate": 1.0735099928178584e-06, + "loss": 0.4937, + "step": 11029 + }, + { + "epoch": 0.86, + "grad_norm": 1.4016056083930608, + "learning_rate": 1.0723776819982257e-06, + "loss": 0.5165, + "step": 11030 + }, + { + "epoch": 0.86, + "grad_norm": 1.3500335071078888, + "learning_rate": 1.0712459348254488e-06, + "loss": 0.5668, + "step": 11031 + }, + { + "epoch": 0.86, + "grad_norm": 1.297593848228336, + "learning_rate": 1.070114751370984e-06, + "loss": 0.5351, + "step": 11032 + }, + { + "epoch": 0.86, + "grad_norm": 1.2029376193684853, + "learning_rate": 1.0689841317062478e-06, + "loss": 0.5146, + "step": 11033 + }, + { + "epoch": 0.86, + "grad_norm": 1.2583092603001815, + "learning_rate": 1.0678540759026224e-06, + "loss": 0.5115, + "step": 11034 + }, + { + "epoch": 0.86, + "grad_norm": 1.1310975346117904, + "learning_rate": 1.06672458403145e-06, + "loss": 0.4644, + "step": 11035 + }, + { + "epoch": 0.86, + "grad_norm": 1.155860061877357, + "learning_rate": 1.0655956561640456e-06, + "loss": 0.4531, + "step": 11036 + }, + { + "epoch": 0.86, + "grad_norm": 1.2350248243948958, + "learning_rate": 1.0644672923716815e-06, + "loss": 0.5187, + "step": 11037 + }, + { + "epoch": 0.86, + "grad_norm": 1.1229920481691575, + "learning_rate": 1.063339492725599e-06, + "loss": 0.4594, + "step": 11038 + }, + { + "epoch": 0.86, + "grad_norm": 1.2306060714389633, + "learning_rate": 1.062212257297004e-06, + "loss": 0.5002, + "step": 11039 + }, + { + "epoch": 0.86, + "grad_norm": 1.1198946291879468, + "learning_rate": 1.061085586157059e-06, + "loss": 0.4899, + "step": 11040 + }, + { + "epoch": 0.86, + "grad_norm": 1.106094345090458, + "learning_rate": 1.0599594793769007e-06, + "loss": 0.4546, + "step": 11041 + }, + { + "epoch": 0.86, + "grad_norm": 1.0821773829745938, + "learning_rate": 1.0588339370276246e-06, + "loss": 0.4569, + "step": 11042 + }, + { + "epoch": 0.86, + "grad_norm": 1.1525582324064279, + "learning_rate": 1.0577089591802946e-06, + "loss": 0.5114, + "step": 11043 + }, + { + "epoch": 0.86, + "grad_norm": 1.2049383889272243, + "learning_rate": 1.0565845459059343e-06, + "loss": 0.5388, + "step": 11044 + }, + { + "epoch": 0.86, + "grad_norm": 1.2112109460036242, + "learning_rate": 1.055460697275531e-06, + "loss": 0.4769, + "step": 11045 + }, + { + "epoch": 0.86, + "grad_norm": 1.2140897514593778, + "learning_rate": 1.0543374133600414e-06, + "loss": 0.5327, + "step": 11046 + }, + { + "epoch": 0.86, + "grad_norm": 1.2406017810560912, + "learning_rate": 1.0532146942303856e-06, + "loss": 0.4614, + "step": 11047 + }, + { + "epoch": 0.86, + "grad_norm": 1.1999057454604056, + "learning_rate": 1.0520925399574445e-06, + "loss": 0.5037, + "step": 11048 + }, + { + "epoch": 0.86, + "grad_norm": 1.3304978659238906, + "learning_rate": 1.050970950612069e-06, + "loss": 0.5186, + "step": 11049 + }, + { + "epoch": 0.86, + "grad_norm": 1.313559785496592, + "learning_rate": 1.0498499262650664e-06, + "loss": 0.5166, + "step": 11050 + }, + { + "epoch": 0.86, + "grad_norm": 1.4039201721961587, + "learning_rate": 1.048729466987214e-06, + "loss": 0.5652, + "step": 11051 + }, + { + "epoch": 0.86, + "grad_norm": 1.1895264350836758, + "learning_rate": 1.047609572849253e-06, + "loss": 0.4987, + "step": 11052 + }, + { + "epoch": 0.86, + "grad_norm": 1.2395595366600698, + "learning_rate": 1.0464902439218905e-06, + "loss": 0.4588, + "step": 11053 + }, + { + "epoch": 0.86, + "grad_norm": 1.1489206452585214, + "learning_rate": 1.0453714802757908e-06, + "loss": 0.4597, + "step": 11054 + }, + { + "epoch": 0.86, + "grad_norm": 1.131600882978287, + "learning_rate": 1.0442532819815908e-06, + "loss": 0.4819, + "step": 11055 + }, + { + "epoch": 0.86, + "grad_norm": 1.239627960213853, + "learning_rate": 1.0431356491098854e-06, + "loss": 0.5627, + "step": 11056 + }, + { + "epoch": 0.86, + "grad_norm": 1.228975776893027, + "learning_rate": 1.0420185817312377e-06, + "loss": 0.5407, + "step": 11057 + }, + { + "epoch": 0.86, + "grad_norm": 1.3174200031525334, + "learning_rate": 1.040902079916174e-06, + "loss": 0.4887, + "step": 11058 + }, + { + "epoch": 0.86, + "grad_norm": 1.124350678295888, + "learning_rate": 1.0397861437351842e-06, + "loss": 0.5002, + "step": 11059 + }, + { + "epoch": 0.86, + "grad_norm": 1.4144751858090445, + "learning_rate": 1.0386707732587265e-06, + "loss": 0.5842, + "step": 11060 + }, + { + "epoch": 0.86, + "grad_norm": 1.2034571300679713, + "learning_rate": 1.0375559685572145e-06, + "loss": 0.4942, + "step": 11061 + }, + { + "epoch": 0.86, + "grad_norm": 1.2950542031362913, + "learning_rate": 1.0364417297010354e-06, + "loss": 0.5092, + "step": 11062 + }, + { + "epoch": 0.86, + "grad_norm": 1.300741796496622, + "learning_rate": 1.0353280567605373e-06, + "loss": 0.4402, + "step": 11063 + }, + { + "epoch": 0.86, + "grad_norm": 1.1856222364426632, + "learning_rate": 1.0342149498060284e-06, + "loss": 0.4901, + "step": 11064 + }, + { + "epoch": 0.86, + "grad_norm": 1.2007141233505538, + "learning_rate": 1.033102408907789e-06, + "loss": 0.4733, + "step": 11065 + }, + { + "epoch": 0.86, + "grad_norm": 1.2115407118118497, + "learning_rate": 1.0319904341360554e-06, + "loss": 0.5184, + "step": 11066 + }, + { + "epoch": 0.86, + "grad_norm": 1.2610145706012588, + "learning_rate": 1.0308790255610356e-06, + "loss": 0.5298, + "step": 11067 + }, + { + "epoch": 0.86, + "grad_norm": 1.1800056685295375, + "learning_rate": 1.0297681832528971e-06, + "loss": 0.4939, + "step": 11068 + }, + { + "epoch": 0.86, + "grad_norm": 1.130417811703316, + "learning_rate": 1.0286579072817726e-06, + "loss": 0.4671, + "step": 11069 + }, + { + "epoch": 0.86, + "grad_norm": 1.337200170404787, + "learning_rate": 1.0275481977177638e-06, + "loss": 0.5938, + "step": 11070 + }, + { + "epoch": 0.86, + "grad_norm": 1.1450482655642957, + "learning_rate": 1.0264390546309254e-06, + "loss": 0.4816, + "step": 11071 + }, + { + "epoch": 0.86, + "grad_norm": 1.179749493517386, + "learning_rate": 1.0253304780912887e-06, + "loss": 0.4636, + "step": 11072 + }, + { + "epoch": 0.86, + "grad_norm": 1.1485406219479515, + "learning_rate": 1.024222468168843e-06, + "loss": 0.4796, + "step": 11073 + }, + { + "epoch": 0.86, + "grad_norm": 1.2519898312524385, + "learning_rate": 1.023115024933541e-06, + "loss": 0.5411, + "step": 11074 + }, + { + "epoch": 0.86, + "grad_norm": 1.196609391902362, + "learning_rate": 1.0220081484553025e-06, + "loss": 0.4884, + "step": 11075 + }, + { + "epoch": 0.86, + "grad_norm": 1.20362318549424, + "learning_rate": 1.0209018388040093e-06, + "loss": 0.4696, + "step": 11076 + }, + { + "epoch": 0.86, + "grad_norm": 1.3149237732496482, + "learning_rate": 1.019796096049508e-06, + "loss": 0.5093, + "step": 11077 + }, + { + "epoch": 0.86, + "grad_norm": 1.1467307766653911, + "learning_rate": 1.0186909202616114e-06, + "loss": 0.4702, + "step": 11078 + }, + { + "epoch": 0.86, + "grad_norm": 1.1576955368124666, + "learning_rate": 1.017586311510095e-06, + "loss": 0.4703, + "step": 11079 + }, + { + "epoch": 0.86, + "grad_norm": 1.216619634807926, + "learning_rate": 1.0164822698646992e-06, + "loss": 0.5605, + "step": 11080 + }, + { + "epoch": 0.86, + "grad_norm": 1.2133083146436283, + "learning_rate": 1.0153787953951245e-06, + "loss": 0.5514, + "step": 11081 + }, + { + "epoch": 0.86, + "grad_norm": 1.4289492175700447, + "learning_rate": 1.0142758881710413e-06, + "loss": 0.5101, + "step": 11082 + }, + { + "epoch": 0.86, + "grad_norm": 1.2952194217237716, + "learning_rate": 1.013173548262083e-06, + "loss": 0.5515, + "step": 11083 + }, + { + "epoch": 0.86, + "grad_norm": 1.1800721911799206, + "learning_rate": 1.0120717757378428e-06, + "loss": 0.4534, + "step": 11084 + }, + { + "epoch": 0.86, + "grad_norm": 1.3216230670926006, + "learning_rate": 1.0109705706678862e-06, + "loss": 0.5408, + "step": 11085 + }, + { + "epoch": 0.86, + "grad_norm": 1.2886630277396456, + "learning_rate": 1.009869933121731e-06, + "loss": 0.4884, + "step": 11086 + }, + { + "epoch": 0.86, + "grad_norm": 1.2858279713444503, + "learning_rate": 1.0087698631688713e-06, + "loss": 0.5274, + "step": 11087 + }, + { + "epoch": 0.86, + "grad_norm": 1.1717983475093516, + "learning_rate": 1.0076703608787575e-06, + "loss": 0.5071, + "step": 11088 + }, + { + "epoch": 0.86, + "grad_norm": 1.311829577382489, + "learning_rate": 1.0065714263208092e-06, + "loss": 0.5615, + "step": 11089 + }, + { + "epoch": 0.86, + "grad_norm": 1.238725889261915, + "learning_rate": 1.005473059564408e-06, + "loss": 0.4752, + "step": 11090 + }, + { + "epoch": 0.86, + "grad_norm": 1.1352940706100734, + "learning_rate": 1.004375260678897e-06, + "loss": 0.4839, + "step": 11091 + }, + { + "epoch": 0.86, + "grad_norm": 1.2168244041892375, + "learning_rate": 1.0032780297335886e-06, + "loss": 0.4637, + "step": 11092 + }, + { + "epoch": 0.86, + "grad_norm": 1.1732857414278512, + "learning_rate": 1.002181366797753e-06, + "loss": 0.5696, + "step": 11093 + }, + { + "epoch": 0.86, + "grad_norm": 1.2030735252565226, + "learning_rate": 1.0010852719406306e-06, + "loss": 0.5126, + "step": 11094 + }, + { + "epoch": 0.86, + "grad_norm": 1.1032855578928529, + "learning_rate": 9.999897452314256e-07, + "loss": 0.4557, + "step": 11095 + }, + { + "epoch": 0.86, + "grad_norm": 1.2141840575645526, + "learning_rate": 9.988947867392995e-07, + "loss": 0.4811, + "step": 11096 + }, + { + "epoch": 0.86, + "grad_norm": 1.1192433574718834, + "learning_rate": 9.978003965333849e-07, + "loss": 0.4859, + "step": 11097 + }, + { + "epoch": 0.86, + "grad_norm": 1.2169969127932458, + "learning_rate": 9.967065746827764e-07, + "loss": 0.5249, + "step": 11098 + }, + { + "epoch": 0.86, + "grad_norm": 1.2591108172839338, + "learning_rate": 9.956133212565332e-07, + "loss": 0.4623, + "step": 11099 + }, + { + "epoch": 0.86, + "grad_norm": 1.2404433667571837, + "learning_rate": 9.945206363236804e-07, + "loss": 0.5624, + "step": 11100 + }, + { + "epoch": 0.86, + "grad_norm": 1.139843346306126, + "learning_rate": 9.93428519953199e-07, + "loss": 0.5046, + "step": 11101 + }, + { + "epoch": 0.86, + "grad_norm": 1.0546810573804961, + "learning_rate": 9.92336972214044e-07, + "loss": 0.4624, + "step": 11102 + }, + { + "epoch": 0.86, + "grad_norm": 1.2546564157704336, + "learning_rate": 9.912459931751296e-07, + "loss": 0.466, + "step": 11103 + }, + { + "epoch": 0.86, + "grad_norm": 1.2656281082680292, + "learning_rate": 9.901555829053333e-07, + "loss": 0.5338, + "step": 11104 + }, + { + "epoch": 0.86, + "grad_norm": 1.3375466900100528, + "learning_rate": 9.890657414735017e-07, + "loss": 0.5899, + "step": 11105 + }, + { + "epoch": 0.86, + "grad_norm": 1.2502162269496075, + "learning_rate": 9.879764689484383e-07, + "loss": 0.5204, + "step": 11106 + }, + { + "epoch": 0.86, + "grad_norm": 1.153685432024286, + "learning_rate": 9.868877653989161e-07, + "loss": 0.4511, + "step": 11107 + }, + { + "epoch": 0.86, + "grad_norm": 1.1333907657451077, + "learning_rate": 9.857996308936713e-07, + "loss": 0.4797, + "step": 11108 + }, + { + "epoch": 0.86, + "grad_norm": 1.2702463806136817, + "learning_rate": 9.847120655014032e-07, + "loss": 0.5111, + "step": 11109 + }, + { + "epoch": 0.86, + "grad_norm": 1.1980189183035805, + "learning_rate": 9.836250692907745e-07, + "loss": 0.471, + "step": 11110 + }, + { + "epoch": 0.86, + "grad_norm": 1.1265073850062017, + "learning_rate": 9.825386423304162e-07, + "loss": 0.4891, + "step": 11111 + }, + { + "epoch": 0.86, + "grad_norm": 1.192428000653777, + "learning_rate": 9.814527846889165e-07, + "loss": 0.5272, + "step": 11112 + }, + { + "epoch": 0.86, + "grad_norm": 1.0941625634431469, + "learning_rate": 9.80367496434831e-07, + "loss": 0.4911, + "step": 11113 + }, + { + "epoch": 0.86, + "grad_norm": 1.319738138985596, + "learning_rate": 9.792827776366797e-07, + "loss": 0.5725, + "step": 11114 + }, + { + "epoch": 0.86, + "grad_norm": 1.1911902560196133, + "learning_rate": 9.781986283629484e-07, + "loss": 0.4553, + "step": 11115 + }, + { + "epoch": 0.86, + "grad_norm": 1.2118604034914062, + "learning_rate": 9.771150486820857e-07, + "loss": 0.4901, + "step": 11116 + }, + { + "epoch": 0.86, + "grad_norm": 1.1574768053371496, + "learning_rate": 9.760320386625e-07, + "loss": 0.4629, + "step": 11117 + }, + { + "epoch": 0.86, + "grad_norm": 1.1492088996885166, + "learning_rate": 9.749495983725688e-07, + "loss": 0.3864, + "step": 11118 + }, + { + "epoch": 0.86, + "grad_norm": 1.292519113779314, + "learning_rate": 9.73867727880633e-07, + "loss": 0.4772, + "step": 11119 + }, + { + "epoch": 0.86, + "grad_norm": 1.204174277777636, + "learning_rate": 9.72786427254996e-07, + "loss": 0.5069, + "step": 11120 + }, + { + "epoch": 0.86, + "grad_norm": 1.1778397394371976, + "learning_rate": 9.717056965639281e-07, + "loss": 0.4645, + "step": 11121 + }, + { + "epoch": 0.86, + "grad_norm": 1.3041749050054863, + "learning_rate": 9.7062553587566e-07, + "loss": 0.5567, + "step": 11122 + }, + { + "epoch": 0.86, + "grad_norm": 1.21181072621425, + "learning_rate": 9.695459452583843e-07, + "loss": 0.5382, + "step": 11123 + }, + { + "epoch": 0.86, + "grad_norm": 1.22384610033517, + "learning_rate": 9.684669247802647e-07, + "loss": 0.5001, + "step": 11124 + }, + { + "epoch": 0.86, + "grad_norm": 1.136770869892159, + "learning_rate": 9.673884745094253e-07, + "loss": 0.4699, + "step": 11125 + }, + { + "epoch": 0.86, + "grad_norm": 1.175447841277305, + "learning_rate": 9.66310594513955e-07, + "loss": 0.4949, + "step": 11126 + }, + { + "epoch": 0.86, + "grad_norm": 1.231113180313904, + "learning_rate": 9.652332848619027e-07, + "loss": 0.5281, + "step": 11127 + }, + { + "epoch": 0.86, + "grad_norm": 1.2594876241375021, + "learning_rate": 9.641565456212864e-07, + "loss": 0.4885, + "step": 11128 + }, + { + "epoch": 0.86, + "grad_norm": 1.2586171674296198, + "learning_rate": 9.63080376860086e-07, + "loss": 0.4656, + "step": 11129 + }, + { + "epoch": 0.86, + "grad_norm": 1.2159352198461497, + "learning_rate": 9.620047786462461e-07, + "loss": 0.4531, + "step": 11130 + }, + { + "epoch": 0.86, + "grad_norm": 1.1642756426974885, + "learning_rate": 9.609297510476767e-07, + "loss": 0.4431, + "step": 11131 + }, + { + "epoch": 0.86, + "grad_norm": 1.1896148721997954, + "learning_rate": 9.59855294132247e-07, + "loss": 0.4788, + "step": 11132 + }, + { + "epoch": 0.86, + "grad_norm": 1.0920543605876847, + "learning_rate": 9.587814079677915e-07, + "loss": 0.4649, + "step": 11133 + }, + { + "epoch": 0.86, + "grad_norm": 1.2793039103307267, + "learning_rate": 9.577080926221127e-07, + "loss": 0.5307, + "step": 11134 + }, + { + "epoch": 0.86, + "grad_norm": 1.2251348265584732, + "learning_rate": 9.566353481629742e-07, + "loss": 0.4814, + "step": 11135 + }, + { + "epoch": 0.86, + "grad_norm": 1.0523110637388777, + "learning_rate": 9.55563174658105e-07, + "loss": 0.4391, + "step": 11136 + }, + { + "epoch": 0.86, + "grad_norm": 1.2141699685702245, + "learning_rate": 9.544915721751946e-07, + "loss": 0.5207, + "step": 11137 + }, + { + "epoch": 0.86, + "grad_norm": 1.1176559959783576, + "learning_rate": 9.534205407818997e-07, + "loss": 0.5235, + "step": 11138 + }, + { + "epoch": 0.86, + "grad_norm": 1.3219456701123153, + "learning_rate": 9.523500805458408e-07, + "loss": 0.5502, + "step": 11139 + }, + { + "epoch": 0.86, + "grad_norm": 1.1973584685179508, + "learning_rate": 9.512801915346004e-07, + "loss": 0.5235, + "step": 11140 + }, + { + "epoch": 0.86, + "grad_norm": 1.2014360338980115, + "learning_rate": 9.502108738157279e-07, + "loss": 0.4954, + "step": 11141 + }, + { + "epoch": 0.86, + "grad_norm": 1.2806681265394901, + "learning_rate": 9.491421274567348e-07, + "loss": 0.5363, + "step": 11142 + }, + { + "epoch": 0.86, + "grad_norm": 1.293565410769431, + "learning_rate": 9.480739525250938e-07, + "loss": 0.4993, + "step": 11143 + }, + { + "epoch": 0.86, + "grad_norm": 1.213010920212061, + "learning_rate": 9.470063490882453e-07, + "loss": 0.4709, + "step": 11144 + }, + { + "epoch": 0.86, + "grad_norm": 1.332143202744742, + "learning_rate": 9.459393172135934e-07, + "loss": 0.5443, + "step": 11145 + }, + { + "epoch": 0.86, + "grad_norm": 1.1991046764282272, + "learning_rate": 9.448728569685073e-07, + "loss": 0.4954, + "step": 11146 + }, + { + "epoch": 0.86, + "grad_norm": 1.1676084032724046, + "learning_rate": 9.438069684203144e-07, + "loss": 0.4936, + "step": 11147 + }, + { + "epoch": 0.86, + "grad_norm": 1.3055712967232531, + "learning_rate": 9.427416516363108e-07, + "loss": 0.5068, + "step": 11148 + }, + { + "epoch": 0.86, + "grad_norm": 1.1371821347136535, + "learning_rate": 9.416769066837561e-07, + "loss": 0.527, + "step": 11149 + }, + { + "epoch": 0.86, + "grad_norm": 1.213476312975917, + "learning_rate": 9.406127336298731e-07, + "loss": 0.5763, + "step": 11150 + }, + { + "epoch": 0.87, + "grad_norm": 1.1345408967247246, + "learning_rate": 9.395491325418505e-07, + "loss": 0.4662, + "step": 11151 + }, + { + "epoch": 0.87, + "grad_norm": 1.124287485615347, + "learning_rate": 9.384861034868376e-07, + "loss": 0.4687, + "step": 11152 + }, + { + "epoch": 0.87, + "grad_norm": 1.0808653902659178, + "learning_rate": 9.374236465319453e-07, + "loss": 0.4809, + "step": 11153 + }, + { + "epoch": 0.87, + "grad_norm": 1.186661725560806, + "learning_rate": 9.363617617442555e-07, + "loss": 0.453, + "step": 11154 + }, + { + "epoch": 0.87, + "grad_norm": 1.1402385004589048, + "learning_rate": 9.353004491908102e-07, + "loss": 0.4664, + "step": 11155 + }, + { + "epoch": 0.87, + "grad_norm": 1.204701072527355, + "learning_rate": 9.342397089386168e-07, + "loss": 0.4924, + "step": 11156 + }, + { + "epoch": 0.87, + "grad_norm": 1.1756724056925454, + "learning_rate": 9.331795410546418e-07, + "loss": 0.4596, + "step": 11157 + }, + { + "epoch": 0.87, + "grad_norm": 1.2837324052475951, + "learning_rate": 9.321199456058205e-07, + "loss": 0.504, + "step": 11158 + }, + { + "epoch": 0.87, + "grad_norm": 1.1566139370133486, + "learning_rate": 9.310609226590516e-07, + "loss": 0.5008, + "step": 11159 + }, + { + "epoch": 0.87, + "grad_norm": 1.1717083621756874, + "learning_rate": 9.300024722811973e-07, + "loss": 0.5294, + "step": 11160 + }, + { + "epoch": 0.87, + "grad_norm": 1.1600006718469187, + "learning_rate": 9.289445945390829e-07, + "loss": 0.4973, + "step": 11161 + }, + { + "epoch": 0.87, + "grad_norm": 1.264032704458381, + "learning_rate": 9.278872894994962e-07, + "loss": 0.5283, + "step": 11162 + }, + { + "epoch": 0.87, + "grad_norm": 1.167258515553021, + "learning_rate": 9.268305572291892e-07, + "loss": 0.4497, + "step": 11163 + }, + { + "epoch": 0.87, + "grad_norm": 1.2316437938684506, + "learning_rate": 9.257743977948808e-07, + "loss": 0.4425, + "step": 11164 + }, + { + "epoch": 0.87, + "grad_norm": 1.1928585518713626, + "learning_rate": 9.247188112632522e-07, + "loss": 0.5146, + "step": 11165 + }, + { + "epoch": 0.87, + "grad_norm": 1.2915489491790895, + "learning_rate": 9.236637977009466e-07, + "loss": 0.5629, + "step": 11166 + }, + { + "epoch": 0.87, + "grad_norm": 1.1797096806139902, + "learning_rate": 9.226093571745753e-07, + "loss": 0.4874, + "step": 11167 + }, + { + "epoch": 0.87, + "grad_norm": 1.2036613160736935, + "learning_rate": 9.215554897507062e-07, + "loss": 0.5092, + "step": 11168 + }, + { + "epoch": 0.87, + "grad_norm": 1.246713992170833, + "learning_rate": 9.205021954958781e-07, + "loss": 0.4835, + "step": 11169 + }, + { + "epoch": 0.87, + "grad_norm": 1.2867357930373173, + "learning_rate": 9.194494744765902e-07, + "loss": 0.5126, + "step": 11170 + }, + { + "epoch": 0.87, + "grad_norm": 1.396936175481047, + "learning_rate": 9.183973267593083e-07, + "loss": 0.5147, + "step": 11171 + }, + { + "epoch": 0.87, + "grad_norm": 1.2231216276236363, + "learning_rate": 9.173457524104579e-07, + "loss": 0.5107, + "step": 11172 + }, + { + "epoch": 0.87, + "grad_norm": 1.1079604573303279, + "learning_rate": 9.162947514964283e-07, + "loss": 0.4855, + "step": 11173 + }, + { + "epoch": 0.87, + "grad_norm": 1.2296632582566496, + "learning_rate": 9.152443240835774e-07, + "loss": 0.4597, + "step": 11174 + }, + { + "epoch": 0.87, + "grad_norm": 1.1822916050489412, + "learning_rate": 9.141944702382233e-07, + "loss": 0.4939, + "step": 11175 + }, + { + "epoch": 0.87, + "grad_norm": 1.15621984287834, + "learning_rate": 9.131451900266497e-07, + "loss": 0.5091, + "step": 11176 + }, + { + "epoch": 0.87, + "grad_norm": 1.2923908152108374, + "learning_rate": 9.120964835151025e-07, + "loss": 0.5664, + "step": 11177 + }, + { + "epoch": 0.87, + "grad_norm": 1.1683436783909869, + "learning_rate": 9.110483507697909e-07, + "loss": 0.4956, + "step": 11178 + }, + { + "epoch": 0.87, + "grad_norm": 1.143494380465078, + "learning_rate": 9.100007918568898e-07, + "loss": 0.459, + "step": 11179 + }, + { + "epoch": 0.87, + "grad_norm": 1.2018140611360189, + "learning_rate": 9.089538068425375e-07, + "loss": 0.5266, + "step": 11180 + }, + { + "epoch": 0.87, + "grad_norm": 1.1874766598465667, + "learning_rate": 9.079073957928353e-07, + "loss": 0.4988, + "step": 11181 + }, + { + "epoch": 0.87, + "grad_norm": 1.208609330237781, + "learning_rate": 9.068615587738495e-07, + "loss": 0.5083, + "step": 11182 + }, + { + "epoch": 0.87, + "grad_norm": 1.2322878999026405, + "learning_rate": 9.058162958516059e-07, + "loss": 0.4758, + "step": 11183 + }, + { + "epoch": 0.87, + "grad_norm": 1.2208875348988535, + "learning_rate": 9.047716070920987e-07, + "loss": 0.4875, + "step": 11184 + }, + { + "epoch": 0.87, + "grad_norm": 1.2170034756700947, + "learning_rate": 9.037274925612849e-07, + "loss": 0.5116, + "step": 11185 + }, + { + "epoch": 0.87, + "grad_norm": 1.2934713164851488, + "learning_rate": 9.026839523250863e-07, + "loss": 0.5082, + "step": 11186 + }, + { + "epoch": 0.87, + "grad_norm": 1.3123901866205288, + "learning_rate": 9.016409864493869e-07, + "loss": 0.545, + "step": 11187 + }, + { + "epoch": 0.87, + "grad_norm": 1.26818132669297, + "learning_rate": 9.005985950000318e-07, + "loss": 0.5059, + "step": 11188 + }, + { + "epoch": 0.87, + "grad_norm": 1.1834738390520687, + "learning_rate": 8.995567780428338e-07, + "loss": 0.4754, + "step": 11189 + }, + { + "epoch": 0.87, + "grad_norm": 1.1995655266695722, + "learning_rate": 8.985155356435704e-07, + "loss": 0.5217, + "step": 11190 + }, + { + "epoch": 0.87, + "grad_norm": 1.2144283062695491, + "learning_rate": 8.974748678679768e-07, + "loss": 0.4439, + "step": 11191 + }, + { + "epoch": 0.87, + "grad_norm": 1.1772206769277709, + "learning_rate": 8.964347747817603e-07, + "loss": 0.529, + "step": 11192 + }, + { + "epoch": 0.87, + "grad_norm": 1.199213729997, + "learning_rate": 8.953952564505819e-07, + "loss": 0.4795, + "step": 11193 + }, + { + "epoch": 0.87, + "grad_norm": 1.1775321723608596, + "learning_rate": 8.943563129400756e-07, + "loss": 0.4613, + "step": 11194 + }, + { + "epoch": 0.87, + "grad_norm": 1.1814972089007225, + "learning_rate": 8.933179443158336e-07, + "loss": 0.5584, + "step": 11195 + }, + { + "epoch": 0.87, + "grad_norm": 1.2351378003358497, + "learning_rate": 8.922801506434131e-07, + "loss": 0.5025, + "step": 11196 + }, + { + "epoch": 0.87, + "grad_norm": 1.3081507601983309, + "learning_rate": 8.912429319883398e-07, + "loss": 0.4902, + "step": 11197 + }, + { + "epoch": 0.87, + "grad_norm": 1.1408358862332597, + "learning_rate": 8.902062884160922e-07, + "loss": 0.4499, + "step": 11198 + }, + { + "epoch": 0.87, + "grad_norm": 1.1992552313707423, + "learning_rate": 8.891702199921226e-07, + "loss": 0.4653, + "step": 11199 + }, + { + "epoch": 0.87, + "grad_norm": 1.2486846679154369, + "learning_rate": 8.881347267818441e-07, + "loss": 0.4959, + "step": 11200 + }, + { + "epoch": 0.87, + "grad_norm": 1.136180055267908, + "learning_rate": 8.8709980885063e-07, + "loss": 0.4721, + "step": 11201 + }, + { + "epoch": 0.87, + "grad_norm": 1.2635046068476634, + "learning_rate": 8.860654662638235e-07, + "loss": 0.5093, + "step": 11202 + }, + { + "epoch": 0.87, + "grad_norm": 1.206876627576729, + "learning_rate": 8.850316990867236e-07, + "loss": 0.4926, + "step": 11203 + }, + { + "epoch": 0.87, + "grad_norm": 1.2964802796246564, + "learning_rate": 8.839985073845991e-07, + "loss": 0.5135, + "step": 11204 + }, + { + "epoch": 0.87, + "grad_norm": 1.2375531387479795, + "learning_rate": 8.829658912226813e-07, + "loss": 0.5201, + "step": 11205 + }, + { + "epoch": 0.87, + "grad_norm": 1.1901414505052301, + "learning_rate": 8.819338506661646e-07, + "loss": 0.4904, + "step": 11206 + }, + { + "epoch": 0.87, + "grad_norm": 1.2286828538277166, + "learning_rate": 8.80902385780209e-07, + "loss": 0.4874, + "step": 11207 + }, + { + "epoch": 0.87, + "grad_norm": 1.3429025594877946, + "learning_rate": 8.798714966299327e-07, + "loss": 0.5427, + "step": 11208 + }, + { + "epoch": 0.87, + "grad_norm": 1.1468429394640147, + "learning_rate": 8.788411832804223e-07, + "loss": 0.5172, + "step": 11209 + }, + { + "epoch": 0.87, + "grad_norm": 1.304672697976943, + "learning_rate": 8.77811445796728e-07, + "loss": 0.501, + "step": 11210 + }, + { + "epoch": 0.87, + "grad_norm": 1.1765707919979842, + "learning_rate": 8.767822842438601e-07, + "loss": 0.4648, + "step": 11211 + }, + { + "epoch": 0.87, + "grad_norm": 1.1916922304216908, + "learning_rate": 8.757536986867987e-07, + "loss": 0.5106, + "step": 11212 + }, + { + "epoch": 0.87, + "grad_norm": 1.1705712250162306, + "learning_rate": 8.747256891904787e-07, + "loss": 0.4985, + "step": 11213 + }, + { + "epoch": 0.87, + "grad_norm": 1.2584556214908422, + "learning_rate": 8.736982558198059e-07, + "loss": 0.4938, + "step": 11214 + }, + { + "epoch": 0.87, + "grad_norm": 1.1309118176358817, + "learning_rate": 8.726713986396484e-07, + "loss": 0.4599, + "step": 11215 + }, + { + "epoch": 0.87, + "grad_norm": 1.2241311242172317, + "learning_rate": 8.716451177148355e-07, + "loss": 0.4867, + "step": 11216 + }, + { + "epoch": 0.87, + "grad_norm": 1.108570451874717, + "learning_rate": 8.706194131101653e-07, + "loss": 0.4942, + "step": 11217 + }, + { + "epoch": 0.87, + "grad_norm": 1.2683000433801723, + "learning_rate": 8.695942848903905e-07, + "loss": 0.5085, + "step": 11218 + }, + { + "epoch": 0.87, + "grad_norm": 1.168563844267776, + "learning_rate": 8.685697331202348e-07, + "loss": 0.4712, + "step": 11219 + }, + { + "epoch": 0.87, + "grad_norm": 1.1786564748803117, + "learning_rate": 8.675457578643865e-07, + "loss": 0.4629, + "step": 11220 + }, + { + "epoch": 0.87, + "grad_norm": 1.2282432169230988, + "learning_rate": 8.665223591874894e-07, + "loss": 0.5262, + "step": 11221 + }, + { + "epoch": 0.87, + "grad_norm": 1.1806366493696432, + "learning_rate": 8.654995371541585e-07, + "loss": 0.5021, + "step": 11222 + }, + { + "epoch": 0.87, + "grad_norm": 1.2780642562645959, + "learning_rate": 8.644772918289723e-07, + "loss": 0.5358, + "step": 11223 + }, + { + "epoch": 0.87, + "grad_norm": 1.2631192303374286, + "learning_rate": 8.634556232764646e-07, + "loss": 0.5371, + "step": 11224 + }, + { + "epoch": 0.87, + "grad_norm": 1.2665903624757802, + "learning_rate": 8.624345315611427e-07, + "loss": 0.5115, + "step": 11225 + }, + { + "epoch": 0.87, + "grad_norm": 1.2488369776436161, + "learning_rate": 8.614140167474716e-07, + "loss": 0.5275, + "step": 11226 + }, + { + "epoch": 0.87, + "grad_norm": 1.2711884935141544, + "learning_rate": 8.603940788998832e-07, + "loss": 0.5312, + "step": 11227 + }, + { + "epoch": 0.87, + "grad_norm": 1.2271019302193817, + "learning_rate": 8.593747180827728e-07, + "loss": 0.5376, + "step": 11228 + }, + { + "epoch": 0.87, + "grad_norm": 1.2257255079579459, + "learning_rate": 8.58355934360493e-07, + "loss": 0.536, + "step": 11229 + }, + { + "epoch": 0.87, + "grad_norm": 1.148538598002825, + "learning_rate": 8.573377277973704e-07, + "loss": 0.451, + "step": 11230 + }, + { + "epoch": 0.87, + "grad_norm": 1.2276454930989134, + "learning_rate": 8.563200984576847e-07, + "loss": 0.525, + "step": 11231 + }, + { + "epoch": 0.87, + "grad_norm": 1.1447443845019263, + "learning_rate": 8.553030464056867e-07, + "loss": 0.4689, + "step": 11232 + }, + { + "epoch": 0.87, + "grad_norm": 1.1342899553350756, + "learning_rate": 8.542865717055904e-07, + "loss": 0.485, + "step": 11233 + }, + { + "epoch": 0.87, + "grad_norm": 1.17823768097095, + "learning_rate": 8.532706744215657e-07, + "loss": 0.5005, + "step": 11234 + }, + { + "epoch": 0.87, + "grad_norm": 1.26999996042627, + "learning_rate": 8.522553546177536e-07, + "loss": 0.506, + "step": 11235 + }, + { + "epoch": 0.87, + "grad_norm": 1.209582204096564, + "learning_rate": 8.512406123582583e-07, + "loss": 0.4593, + "step": 11236 + }, + { + "epoch": 0.87, + "grad_norm": 1.2588033623545312, + "learning_rate": 8.502264477071442e-07, + "loss": 0.538, + "step": 11237 + }, + { + "epoch": 0.87, + "grad_norm": 1.3303412402657289, + "learning_rate": 8.492128607284434e-07, + "loss": 0.5367, + "step": 11238 + }, + { + "epoch": 0.87, + "grad_norm": 1.206029630701883, + "learning_rate": 8.481998514861434e-07, + "loss": 0.4401, + "step": 11239 + }, + { + "epoch": 0.87, + "grad_norm": 1.0809843321307926, + "learning_rate": 8.471874200442066e-07, + "loss": 0.4749, + "step": 11240 + }, + { + "epoch": 0.87, + "grad_norm": 1.2413176363892566, + "learning_rate": 8.461755664665483e-07, + "loss": 0.524, + "step": 11241 + }, + { + "epoch": 0.87, + "grad_norm": 1.144646752684332, + "learning_rate": 8.451642908170544e-07, + "loss": 0.4626, + "step": 11242 + }, + { + "epoch": 0.87, + "grad_norm": 1.156760077474445, + "learning_rate": 8.441535931595735e-07, + "loss": 0.5196, + "step": 11243 + }, + { + "epoch": 0.87, + "grad_norm": 1.30150829078651, + "learning_rate": 8.431434735579113e-07, + "loss": 0.5514, + "step": 11244 + }, + { + "epoch": 0.87, + "grad_norm": 1.2666890416344163, + "learning_rate": 8.421339320758459e-07, + "loss": 0.5174, + "step": 11245 + }, + { + "epoch": 0.87, + "grad_norm": 1.1652637914051374, + "learning_rate": 8.411249687771128e-07, + "loss": 0.4734, + "step": 11246 + }, + { + "epoch": 0.87, + "grad_norm": 1.2255565138244648, + "learning_rate": 8.401165837254144e-07, + "loss": 0.4757, + "step": 11247 + }, + { + "epoch": 0.87, + "grad_norm": 1.1574821608386052, + "learning_rate": 8.391087769844164e-07, + "loss": 0.4636, + "step": 11248 + }, + { + "epoch": 0.87, + "grad_norm": 1.239983192299173, + "learning_rate": 8.381015486177446e-07, + "loss": 0.5103, + "step": 11249 + }, + { + "epoch": 0.87, + "grad_norm": 1.1932116727685784, + "learning_rate": 8.370948986889915e-07, + "loss": 0.4919, + "step": 11250 + }, + { + "epoch": 0.87, + "grad_norm": 1.1838144027447353, + "learning_rate": 8.360888272617107e-07, + "loss": 0.5026, + "step": 11251 + }, + { + "epoch": 0.87, + "grad_norm": 1.6026362099686522, + "learning_rate": 8.350833343994225e-07, + "loss": 0.4943, + "step": 11252 + }, + { + "epoch": 0.87, + "grad_norm": 1.2231686039609668, + "learning_rate": 8.340784201656094e-07, + "loss": 0.5212, + "step": 11253 + }, + { + "epoch": 0.87, + "grad_norm": 1.226911167227712, + "learning_rate": 8.330740846237128e-07, + "loss": 0.4926, + "step": 11254 + }, + { + "epoch": 0.87, + "grad_norm": 1.3333723638702852, + "learning_rate": 8.320703278371456e-07, + "loss": 0.5012, + "step": 11255 + }, + { + "epoch": 0.87, + "grad_norm": 1.1544069086032005, + "learning_rate": 8.31067149869279e-07, + "loss": 0.4843, + "step": 11256 + }, + { + "epoch": 0.87, + "grad_norm": 1.2797548828270313, + "learning_rate": 8.300645507834481e-07, + "loss": 0.5283, + "step": 11257 + }, + { + "epoch": 0.87, + "grad_norm": 1.1862572642225533, + "learning_rate": 8.290625306429545e-07, + "loss": 0.4402, + "step": 11258 + }, + { + "epoch": 0.87, + "grad_norm": 1.2463512573123212, + "learning_rate": 8.280610895110575e-07, + "loss": 0.5039, + "step": 11259 + }, + { + "epoch": 0.87, + "grad_norm": 1.2776910152326302, + "learning_rate": 8.270602274509864e-07, + "loss": 0.5541, + "step": 11260 + }, + { + "epoch": 0.87, + "grad_norm": 1.2480170734411393, + "learning_rate": 8.260599445259276e-07, + "loss": 0.5114, + "step": 11261 + }, + { + "epoch": 0.87, + "grad_norm": 1.196292500393657, + "learning_rate": 8.250602407990361e-07, + "loss": 0.4747, + "step": 11262 + }, + { + "epoch": 0.87, + "grad_norm": 1.1423940657036784, + "learning_rate": 8.2406111633343e-07, + "loss": 0.4607, + "step": 11263 + }, + { + "epoch": 0.87, + "grad_norm": 1.1827345143338315, + "learning_rate": 8.230625711921858e-07, + "loss": 0.5077, + "step": 11264 + }, + { + "epoch": 0.87, + "grad_norm": 1.3033689128835662, + "learning_rate": 8.220646054383475e-07, + "loss": 0.5258, + "step": 11265 + }, + { + "epoch": 0.87, + "grad_norm": 1.171313291079321, + "learning_rate": 8.210672191349222e-07, + "loss": 0.466, + "step": 11266 + }, + { + "epoch": 0.87, + "grad_norm": 1.1957569168787214, + "learning_rate": 8.20070412344881e-07, + "loss": 0.4996, + "step": 11267 + }, + { + "epoch": 0.87, + "grad_norm": 1.2988797514557535, + "learning_rate": 8.19074185131159e-07, + "loss": 0.5458, + "step": 11268 + }, + { + "epoch": 0.87, + "grad_norm": 1.1304129607189908, + "learning_rate": 8.180785375566491e-07, + "loss": 0.4761, + "step": 11269 + }, + { + "epoch": 0.87, + "grad_norm": 1.1488017847387322, + "learning_rate": 8.170834696842156e-07, + "loss": 0.4788, + "step": 11270 + }, + { + "epoch": 0.87, + "grad_norm": 1.1078762084548042, + "learning_rate": 8.160889815766782e-07, + "loss": 0.4699, + "step": 11271 + }, + { + "epoch": 0.87, + "grad_norm": 1.1308643821908522, + "learning_rate": 8.150950732968255e-07, + "loss": 0.4765, + "step": 11272 + }, + { + "epoch": 0.87, + "grad_norm": 1.298289756769988, + "learning_rate": 8.141017449074096e-07, + "loss": 0.497, + "step": 11273 + }, + { + "epoch": 0.87, + "grad_norm": 1.2390328421777432, + "learning_rate": 8.131089964711447e-07, + "loss": 0.4963, + "step": 11274 + }, + { + "epoch": 0.87, + "grad_norm": 1.1554863444156502, + "learning_rate": 8.121168280507053e-07, + "loss": 0.507, + "step": 11275 + }, + { + "epoch": 0.87, + "grad_norm": 1.170073994473048, + "learning_rate": 8.111252397087344e-07, + "loss": 0.5472, + "step": 11276 + }, + { + "epoch": 0.87, + "grad_norm": 1.2045351162134736, + "learning_rate": 8.101342315078342e-07, + "loss": 0.5229, + "step": 11277 + }, + { + "epoch": 0.87, + "grad_norm": 1.2169387269342302, + "learning_rate": 8.091438035105747e-07, + "loss": 0.4822, + "step": 11278 + }, + { + "epoch": 0.88, + "grad_norm": 1.243795733353835, + "learning_rate": 8.08153955779487e-07, + "loss": 0.5003, + "step": 11279 + }, + { + "epoch": 0.88, + "grad_norm": 1.1594836921844616, + "learning_rate": 8.071646883770634e-07, + "loss": 0.4696, + "step": 11280 + }, + { + "epoch": 0.88, + "grad_norm": 1.2186486617774832, + "learning_rate": 8.061760013657605e-07, + "loss": 0.5129, + "step": 11281 + }, + { + "epoch": 0.88, + "grad_norm": 1.1750611289335768, + "learning_rate": 8.051878948080006e-07, + "loss": 0.5128, + "step": 11282 + }, + { + "epoch": 0.88, + "grad_norm": 1.2175840033155843, + "learning_rate": 8.042003687661671e-07, + "loss": 0.5118, + "step": 11283 + }, + { + "epoch": 0.88, + "grad_norm": 1.33371138676787, + "learning_rate": 8.032134233026101e-07, + "loss": 0.5344, + "step": 11284 + }, + { + "epoch": 0.88, + "grad_norm": 1.1675892088827573, + "learning_rate": 8.022270584796376e-07, + "loss": 0.4784, + "step": 11285 + }, + { + "epoch": 0.88, + "grad_norm": 1.1941431846004729, + "learning_rate": 8.012412743595255e-07, + "loss": 0.5054, + "step": 11286 + }, + { + "epoch": 0.88, + "grad_norm": 1.2484860311726615, + "learning_rate": 8.002560710045115e-07, + "loss": 0.5308, + "step": 11287 + }, + { + "epoch": 0.88, + "grad_norm": 1.2217283292580965, + "learning_rate": 7.992714484767949e-07, + "loss": 0.5337, + "step": 11288 + }, + { + "epoch": 0.88, + "grad_norm": 1.147829634499345, + "learning_rate": 7.982874068385438e-07, + "loss": 0.4634, + "step": 11289 + }, + { + "epoch": 0.88, + "grad_norm": 1.218583706979347, + "learning_rate": 7.973039461518827e-07, + "loss": 0.4937, + "step": 11290 + }, + { + "epoch": 0.88, + "grad_norm": 1.302723302608259, + "learning_rate": 7.963210664789022e-07, + "loss": 0.5007, + "step": 11291 + }, + { + "epoch": 0.88, + "grad_norm": 1.1091457721128348, + "learning_rate": 7.953387678816571e-07, + "loss": 0.4481, + "step": 11292 + }, + { + "epoch": 0.88, + "grad_norm": 1.177266700213032, + "learning_rate": 7.943570504221654e-07, + "loss": 0.4766, + "step": 11293 + }, + { + "epoch": 0.88, + "grad_norm": 1.0073171773452696, + "learning_rate": 7.933759141624098e-07, + "loss": 0.4559, + "step": 11294 + }, + { + "epoch": 0.88, + "grad_norm": 1.1714233036537556, + "learning_rate": 7.923953591643308e-07, + "loss": 0.4675, + "step": 11295 + }, + { + "epoch": 0.88, + "grad_norm": 1.1046513207539383, + "learning_rate": 7.914153854898376e-07, + "loss": 0.4606, + "step": 11296 + }, + { + "epoch": 0.88, + "grad_norm": 1.1417194498116632, + "learning_rate": 7.90435993200801e-07, + "loss": 0.4495, + "step": 11297 + }, + { + "epoch": 0.88, + "grad_norm": 1.271493422421848, + "learning_rate": 7.89457182359058e-07, + "loss": 0.516, + "step": 11298 + }, + { + "epoch": 0.88, + "grad_norm": 1.261224750383257, + "learning_rate": 7.884789530264004e-07, + "loss": 0.507, + "step": 11299 + }, + { + "epoch": 0.88, + "grad_norm": 1.2302893613810355, + "learning_rate": 7.875013052645941e-07, + "loss": 0.5111, + "step": 11300 + }, + { + "epoch": 0.88, + "grad_norm": 1.0920654403305488, + "learning_rate": 7.865242391353589e-07, + "loss": 0.4337, + "step": 11301 + }, + { + "epoch": 0.88, + "grad_norm": 1.0906805838600118, + "learning_rate": 7.855477547003831e-07, + "loss": 0.4428, + "step": 11302 + }, + { + "epoch": 0.88, + "grad_norm": 1.2393748751903324, + "learning_rate": 7.845718520213186e-07, + "loss": 0.5743, + "step": 11303 + }, + { + "epoch": 0.88, + "grad_norm": 1.1909548044861884, + "learning_rate": 7.835965311597804e-07, + "loss": 0.4726, + "step": 11304 + }, + { + "epoch": 0.88, + "grad_norm": 1.138015269855835, + "learning_rate": 7.826217921773416e-07, + "loss": 0.4813, + "step": 11305 + }, + { + "epoch": 0.88, + "grad_norm": 1.2169900070413333, + "learning_rate": 7.81647635135544e-07, + "loss": 0.501, + "step": 11306 + }, + { + "epoch": 0.88, + "grad_norm": 1.3549569782592175, + "learning_rate": 7.806740600958918e-07, + "loss": 0.5692, + "step": 11307 + }, + { + "epoch": 0.88, + "grad_norm": 1.1757004415428356, + "learning_rate": 7.797010671198534e-07, + "loss": 0.4679, + "step": 11308 + }, + { + "epoch": 0.88, + "grad_norm": 1.242894433351586, + "learning_rate": 7.787286562688556e-07, + "loss": 0.4583, + "step": 11309 + }, + { + "epoch": 0.88, + "grad_norm": 1.2404504782967962, + "learning_rate": 7.777568276042946e-07, + "loss": 0.5195, + "step": 11310 + }, + { + "epoch": 0.88, + "grad_norm": 1.1420621831903188, + "learning_rate": 7.767855811875236e-07, + "loss": 0.4584, + "step": 11311 + }, + { + "epoch": 0.88, + "grad_norm": 1.3082915455326396, + "learning_rate": 7.758149170798656e-07, + "loss": 0.5237, + "step": 11312 + }, + { + "epoch": 0.88, + "grad_norm": 1.1417436731546178, + "learning_rate": 7.748448353426019e-07, + "loss": 0.4933, + "step": 11313 + }, + { + "epoch": 0.88, + "grad_norm": 1.2384882131238193, + "learning_rate": 7.7387533603698e-07, + "loss": 0.5721, + "step": 11314 + }, + { + "epoch": 0.88, + "grad_norm": 1.2964811531341784, + "learning_rate": 7.729064192242075e-07, + "loss": 0.5366, + "step": 11315 + }, + { + "epoch": 0.88, + "grad_norm": 1.1619825277902553, + "learning_rate": 7.71938084965459e-07, + "loss": 0.4861, + "step": 11316 + }, + { + "epoch": 0.88, + "grad_norm": 1.1922181911692673, + "learning_rate": 7.709703333218698e-07, + "loss": 0.498, + "step": 11317 + }, + { + "epoch": 0.88, + "grad_norm": 1.2431795489357234, + "learning_rate": 7.700031643545402e-07, + "loss": 0.5175, + "step": 11318 + }, + { + "epoch": 0.88, + "grad_norm": 1.2842321348902581, + "learning_rate": 7.690365781245291e-07, + "loss": 0.532, + "step": 11319 + }, + { + "epoch": 0.88, + "grad_norm": 1.1670132814679615, + "learning_rate": 7.680705746928663e-07, + "loss": 0.485, + "step": 11320 + }, + { + "epoch": 0.88, + "grad_norm": 1.1312991326553257, + "learning_rate": 7.671051541205376e-07, + "loss": 0.482, + "step": 11321 + }, + { + "epoch": 0.88, + "grad_norm": 1.234337818213493, + "learning_rate": 7.661403164684955e-07, + "loss": 0.5314, + "step": 11322 + }, + { + "epoch": 0.88, + "grad_norm": 1.0515430057526833, + "learning_rate": 7.651760617976556e-07, + "loss": 0.4569, + "step": 11323 + }, + { + "epoch": 0.88, + "grad_norm": 1.062439187497396, + "learning_rate": 7.64212390168898e-07, + "loss": 0.4803, + "step": 11324 + }, + { + "epoch": 0.88, + "grad_norm": 1.1316083624984197, + "learning_rate": 7.63249301643062e-07, + "loss": 0.4445, + "step": 11325 + }, + { + "epoch": 0.88, + "grad_norm": 1.0981880610189232, + "learning_rate": 7.622867962809521e-07, + "loss": 0.4418, + "step": 11326 + }, + { + "epoch": 0.88, + "grad_norm": 1.4834911726794078, + "learning_rate": 7.613248741433365e-07, + "loss": 0.5421, + "step": 11327 + }, + { + "epoch": 0.88, + "grad_norm": 1.297139519980659, + "learning_rate": 7.603635352909489e-07, + "loss": 0.5212, + "step": 11328 + }, + { + "epoch": 0.88, + "grad_norm": 1.2572005779620647, + "learning_rate": 7.594027797844805e-07, + "loss": 0.5428, + "step": 11329 + }, + { + "epoch": 0.88, + "grad_norm": 1.1773678033570112, + "learning_rate": 7.584426076845908e-07, + "loss": 0.5424, + "step": 11330 + }, + { + "epoch": 0.88, + "grad_norm": 1.211774475272321, + "learning_rate": 7.574830190518978e-07, + "loss": 0.5016, + "step": 11331 + }, + { + "epoch": 0.88, + "grad_norm": 1.1955396149731525, + "learning_rate": 7.565240139469877e-07, + "loss": 0.5345, + "step": 11332 + }, + { + "epoch": 0.88, + "grad_norm": 1.1980447894147563, + "learning_rate": 7.555655924304062e-07, + "loss": 0.5159, + "step": 11333 + }, + { + "epoch": 0.88, + "grad_norm": 1.16401743961654, + "learning_rate": 7.54607754562664e-07, + "loss": 0.5148, + "step": 11334 + }, + { + "epoch": 0.88, + "grad_norm": 1.2553314000920262, + "learning_rate": 7.536505004042361e-07, + "loss": 0.5302, + "step": 11335 + }, + { + "epoch": 0.88, + "grad_norm": 1.2720684813502816, + "learning_rate": 7.526938300155539e-07, + "loss": 0.5237, + "step": 11336 + }, + { + "epoch": 0.88, + "grad_norm": 1.1479334860583175, + "learning_rate": 7.517377434570217e-07, + "loss": 0.4627, + "step": 11337 + }, + { + "epoch": 0.88, + "grad_norm": 1.2516650078140692, + "learning_rate": 7.50782240789002e-07, + "loss": 0.4981, + "step": 11338 + }, + { + "epoch": 0.88, + "grad_norm": 1.2217910191390557, + "learning_rate": 7.498273220718167e-07, + "loss": 0.5245, + "step": 11339 + }, + { + "epoch": 0.88, + "grad_norm": 1.290836708494584, + "learning_rate": 7.488729873657586e-07, + "loss": 0.5503, + "step": 11340 + }, + { + "epoch": 0.88, + "grad_norm": 1.343059761731569, + "learning_rate": 7.479192367310773e-07, + "loss": 0.4808, + "step": 11341 + }, + { + "epoch": 0.88, + "grad_norm": 1.221843266379924, + "learning_rate": 7.46966070227989e-07, + "loss": 0.4829, + "step": 11342 + }, + { + "epoch": 0.88, + "grad_norm": 1.2567819199887884, + "learning_rate": 7.460134879166725e-07, + "loss": 0.5545, + "step": 11343 + }, + { + "epoch": 0.88, + "grad_norm": 1.1358596334306892, + "learning_rate": 7.450614898572683e-07, + "loss": 0.5319, + "step": 11344 + }, + { + "epoch": 0.88, + "grad_norm": 1.2651083562830525, + "learning_rate": 7.44110076109883e-07, + "loss": 0.4735, + "step": 11345 + }, + { + "epoch": 0.88, + "grad_norm": 1.1864908849701237, + "learning_rate": 7.431592467345816e-07, + "loss": 0.4745, + "step": 11346 + }, + { + "epoch": 0.88, + "grad_norm": 1.2582502373464548, + "learning_rate": 7.422090017913952e-07, + "loss": 0.5295, + "step": 11347 + }, + { + "epoch": 0.88, + "grad_norm": 1.2223549858629155, + "learning_rate": 7.412593413403202e-07, + "loss": 0.4948, + "step": 11348 + }, + { + "epoch": 0.88, + "grad_norm": 1.2842134304472528, + "learning_rate": 7.403102654413108e-07, + "loss": 0.5468, + "step": 11349 + }, + { + "epoch": 0.88, + "grad_norm": 1.1531296975471728, + "learning_rate": 7.393617741542891e-07, + "loss": 0.4962, + "step": 11350 + }, + { + "epoch": 0.88, + "grad_norm": 1.197141158586891, + "learning_rate": 7.384138675391362e-07, + "loss": 0.434, + "step": 11351 + }, + { + "epoch": 0.88, + "grad_norm": 1.2516980082367521, + "learning_rate": 7.374665456556984e-07, + "loss": 0.519, + "step": 11352 + }, + { + "epoch": 0.88, + "grad_norm": 1.252221089220329, + "learning_rate": 7.365198085637871e-07, + "loss": 0.5258, + "step": 11353 + }, + { + "epoch": 0.88, + "grad_norm": 1.3108531065409659, + "learning_rate": 7.35573656323173e-07, + "loss": 0.4826, + "step": 11354 + }, + { + "epoch": 0.88, + "grad_norm": 1.2326545797910435, + "learning_rate": 7.346280889935931e-07, + "loss": 0.4995, + "step": 11355 + }, + { + "epoch": 0.88, + "grad_norm": 1.1569810051475684, + "learning_rate": 7.336831066347438e-07, + "loss": 0.4898, + "step": 11356 + }, + { + "epoch": 0.88, + "grad_norm": 1.1640434519598215, + "learning_rate": 7.327387093062887e-07, + "loss": 0.4469, + "step": 11357 + }, + { + "epoch": 0.88, + "grad_norm": 1.1720978080017563, + "learning_rate": 7.317948970678524e-07, + "loss": 0.483, + "step": 11358 + }, + { + "epoch": 0.88, + "grad_norm": 1.1839182191777131, + "learning_rate": 7.308516699790202e-07, + "loss": 0.4855, + "step": 11359 + }, + { + "epoch": 0.88, + "grad_norm": 1.253123529295403, + "learning_rate": 7.29909028099347e-07, + "loss": 0.4706, + "step": 11360 + }, + { + "epoch": 0.88, + "grad_norm": 1.188400027867705, + "learning_rate": 7.289669714883419e-07, + "loss": 0.5155, + "step": 11361 + }, + { + "epoch": 0.88, + "grad_norm": 1.222107639263959, + "learning_rate": 7.280255002054848e-07, + "loss": 0.5247, + "step": 11362 + }, + { + "epoch": 0.88, + "grad_norm": 1.186555034703382, + "learning_rate": 7.270846143102139e-07, + "loss": 0.4576, + "step": 11363 + }, + { + "epoch": 0.88, + "grad_norm": 1.2188331869052162, + "learning_rate": 7.26144313861934e-07, + "loss": 0.5334, + "step": 11364 + }, + { + "epoch": 0.88, + "grad_norm": 1.3326202611707463, + "learning_rate": 7.252045989200118e-07, + "loss": 0.5555, + "step": 11365 + }, + { + "epoch": 0.88, + "grad_norm": 1.3235553441280918, + "learning_rate": 7.242654695437734e-07, + "loss": 0.5047, + "step": 11366 + }, + { + "epoch": 0.88, + "grad_norm": 1.1612001596848196, + "learning_rate": 7.233269257925124e-07, + "loss": 0.4436, + "step": 11367 + }, + { + "epoch": 0.88, + "grad_norm": 1.2319523178292437, + "learning_rate": 7.223889677254858e-07, + "loss": 0.4354, + "step": 11368 + }, + { + "epoch": 0.88, + "grad_norm": 1.183256397074468, + "learning_rate": 7.214515954019086e-07, + "loss": 0.5105, + "step": 11369 + }, + { + "epoch": 0.88, + "grad_norm": 1.231004725390614, + "learning_rate": 7.205148088809632e-07, + "loss": 0.476, + "step": 11370 + }, + { + "epoch": 0.88, + "grad_norm": 1.2966763907331895, + "learning_rate": 7.195786082217937e-07, + "loss": 0.5428, + "step": 11371 + }, + { + "epoch": 0.88, + "grad_norm": 1.314128138369065, + "learning_rate": 7.18642993483507e-07, + "loss": 0.5137, + "step": 11372 + }, + { + "epoch": 0.88, + "grad_norm": 1.1548414156155586, + "learning_rate": 7.177079647251728e-07, + "loss": 0.4577, + "step": 11373 + }, + { + "epoch": 0.88, + "grad_norm": 1.254703779103097, + "learning_rate": 7.167735220058258e-07, + "loss": 0.4953, + "step": 11374 + }, + { + "epoch": 0.88, + "grad_norm": 1.0821017025482853, + "learning_rate": 7.158396653844635e-07, + "loss": 0.4574, + "step": 11375 + }, + { + "epoch": 0.88, + "grad_norm": 1.2697560742182812, + "learning_rate": 7.149063949200408e-07, + "loss": 0.4829, + "step": 11376 + }, + { + "epoch": 0.88, + "grad_norm": 1.234020713471559, + "learning_rate": 7.139737106714817e-07, + "loss": 0.4451, + "step": 11377 + }, + { + "epoch": 0.88, + "grad_norm": 1.2116252299853332, + "learning_rate": 7.130416126976747e-07, + "loss": 0.4725, + "step": 11378 + }, + { + "epoch": 0.88, + "grad_norm": 1.2450852573264932, + "learning_rate": 7.121101010574626e-07, + "loss": 0.5025, + "step": 11379 + }, + { + "epoch": 0.88, + "grad_norm": 1.2100191140832603, + "learning_rate": 7.111791758096609e-07, + "loss": 0.4665, + "step": 11380 + }, + { + "epoch": 0.88, + "grad_norm": 1.1004965203423887, + "learning_rate": 7.102488370130411e-07, + "loss": 0.5057, + "step": 11381 + }, + { + "epoch": 0.88, + "grad_norm": 1.15057159192129, + "learning_rate": 7.093190847263398e-07, + "loss": 0.487, + "step": 11382 + }, + { + "epoch": 0.88, + "grad_norm": 1.2147374244599554, + "learning_rate": 7.083899190082588e-07, + "loss": 0.4996, + "step": 11383 + }, + { + "epoch": 0.88, + "grad_norm": 1.2225960421134476, + "learning_rate": 7.074613399174601e-07, + "loss": 0.4965, + "step": 11384 + }, + { + "epoch": 0.88, + "grad_norm": 1.282002809384581, + "learning_rate": 7.065333475125713e-07, + "loss": 0.4787, + "step": 11385 + }, + { + "epoch": 0.88, + "grad_norm": 1.2490485862126859, + "learning_rate": 7.05605941852181e-07, + "loss": 0.544, + "step": 11386 + }, + { + "epoch": 0.88, + "grad_norm": 1.2084481524581705, + "learning_rate": 7.046791229948391e-07, + "loss": 0.46, + "step": 11387 + }, + { + "epoch": 0.88, + "grad_norm": 1.161967754550203, + "learning_rate": 7.037528909990632e-07, + "loss": 0.4995, + "step": 11388 + }, + { + "epoch": 0.88, + "grad_norm": 1.2209253215743834, + "learning_rate": 7.028272459233277e-07, + "loss": 0.5006, + "step": 11389 + }, + { + "epoch": 0.88, + "grad_norm": 1.216686017174321, + "learning_rate": 7.019021878260757e-07, + "loss": 0.5209, + "step": 11390 + }, + { + "epoch": 0.88, + "grad_norm": 1.0741391239132188, + "learning_rate": 7.009777167657117e-07, + "loss": 0.4743, + "step": 11391 + }, + { + "epoch": 0.88, + "grad_norm": 1.2367567913207473, + "learning_rate": 7.000538328006001e-07, + "loss": 0.502, + "step": 11392 + }, + { + "epoch": 0.88, + "grad_norm": 1.2878968488307398, + "learning_rate": 6.99130535989071e-07, + "loss": 0.4929, + "step": 11393 + }, + { + "epoch": 0.88, + "grad_norm": 1.2053002349094968, + "learning_rate": 6.982078263894176e-07, + "loss": 0.52, + "step": 11394 + }, + { + "epoch": 0.88, + "grad_norm": 1.0935505821449385, + "learning_rate": 6.972857040598945e-07, + "loss": 0.4696, + "step": 11395 + }, + { + "epoch": 0.88, + "grad_norm": 1.2263248966687565, + "learning_rate": 6.96364169058722e-07, + "loss": 0.5011, + "step": 11396 + }, + { + "epoch": 0.88, + "grad_norm": 1.3132745183242291, + "learning_rate": 6.954432214440798e-07, + "loss": 0.5406, + "step": 11397 + }, + { + "epoch": 0.88, + "grad_norm": 1.1288686039861384, + "learning_rate": 6.945228612741129e-07, + "loss": 0.4861, + "step": 11398 + }, + { + "epoch": 0.88, + "grad_norm": 1.2181872022081492, + "learning_rate": 6.936030886069256e-07, + "loss": 0.5083, + "step": 11399 + }, + { + "epoch": 0.88, + "grad_norm": 1.1324142282102119, + "learning_rate": 6.926839035005905e-07, + "loss": 0.4333, + "step": 11400 + }, + { + "epoch": 0.88, + "grad_norm": 0.9944093112552583, + "learning_rate": 6.917653060131413e-07, + "loss": 0.4189, + "step": 11401 + }, + { + "epoch": 0.88, + "grad_norm": 1.0954491087959555, + "learning_rate": 6.908472962025714e-07, + "loss": 0.4524, + "step": 11402 + }, + { + "epoch": 0.88, + "grad_norm": 1.415816341715057, + "learning_rate": 6.899298741268412e-07, + "loss": 0.531, + "step": 11403 + }, + { + "epoch": 0.88, + "grad_norm": 1.1697057355579827, + "learning_rate": 6.89013039843871e-07, + "loss": 0.4878, + "step": 11404 + }, + { + "epoch": 0.88, + "grad_norm": 1.2172217446285696, + "learning_rate": 6.880967934115457e-07, + "loss": 0.5402, + "step": 11405 + }, + { + "epoch": 0.88, + "grad_norm": 1.1613533702458352, + "learning_rate": 6.871811348877144e-07, + "loss": 0.5041, + "step": 11406 + }, + { + "epoch": 0.88, + "grad_norm": 1.2884818416554964, + "learning_rate": 6.862660643301855e-07, + "loss": 0.5085, + "step": 11407 + }, + { + "epoch": 0.89, + "grad_norm": 1.0959561033820955, + "learning_rate": 6.853515817967327e-07, + "loss": 0.4653, + "step": 11408 + }, + { + "epoch": 0.89, + "grad_norm": 1.234250074414797, + "learning_rate": 6.844376873450908e-07, + "loss": 0.5013, + "step": 11409 + }, + { + "epoch": 0.89, + "grad_norm": 1.1417181446600595, + "learning_rate": 6.835243810329595e-07, + "loss": 0.5016, + "step": 11410 + }, + { + "epoch": 0.89, + "grad_norm": 1.1355340296199523, + "learning_rate": 6.826116629180024e-07, + "loss": 0.48, + "step": 11411 + }, + { + "epoch": 0.89, + "grad_norm": 1.1304767073197668, + "learning_rate": 6.816995330578413e-07, + "loss": 0.4767, + "step": 11412 + }, + { + "epoch": 0.89, + "grad_norm": 1.3036671834739988, + "learning_rate": 6.807879915100646e-07, + "loss": 0.5729, + "step": 11413 + }, + { + "epoch": 0.89, + "grad_norm": 1.2731069124010923, + "learning_rate": 6.798770383322218e-07, + "loss": 0.5003, + "step": 11414 + }, + { + "epoch": 0.89, + "grad_norm": 1.1115256403832667, + "learning_rate": 6.789666735818279e-07, + "loss": 0.452, + "step": 11415 + }, + { + "epoch": 0.89, + "grad_norm": 1.2920868508050043, + "learning_rate": 6.780568973163604e-07, + "loss": 0.5029, + "step": 11416 + }, + { + "epoch": 0.89, + "grad_norm": 1.0904333782458138, + "learning_rate": 6.771477095932533e-07, + "loss": 0.4334, + "step": 11417 + }, + { + "epoch": 0.89, + "grad_norm": 1.1416562787625226, + "learning_rate": 6.762391104699129e-07, + "loss": 0.4941, + "step": 11418 + }, + { + "epoch": 0.89, + "grad_norm": 1.265655517210204, + "learning_rate": 6.753311000036999e-07, + "loss": 0.54, + "step": 11419 + }, + { + "epoch": 0.89, + "grad_norm": 1.1419123348152156, + "learning_rate": 6.744236782519431e-07, + "loss": 0.5165, + "step": 11420 + }, + { + "epoch": 0.89, + "grad_norm": 1.1049265795692982, + "learning_rate": 6.735168452719354e-07, + "loss": 0.5151, + "step": 11421 + }, + { + "epoch": 0.89, + "grad_norm": 1.2633155662391082, + "learning_rate": 6.726106011209266e-07, + "loss": 0.5072, + "step": 11422 + }, + { + "epoch": 0.89, + "grad_norm": 1.2010426521572934, + "learning_rate": 6.71704945856132e-07, + "loss": 0.5102, + "step": 11423 + }, + { + "epoch": 0.89, + "grad_norm": 1.3141656025503459, + "learning_rate": 6.707998795347326e-07, + "loss": 0.5289, + "step": 11424 + }, + { + "epoch": 0.89, + "grad_norm": 1.2124094722813237, + "learning_rate": 6.698954022138692e-07, + "loss": 0.5087, + "step": 11425 + }, + { + "epoch": 0.89, + "grad_norm": 1.2292741577648272, + "learning_rate": 6.689915139506475e-07, + "loss": 0.5242, + "step": 11426 + }, + { + "epoch": 0.89, + "grad_norm": 1.2817797612335684, + "learning_rate": 6.680882148021318e-07, + "loss": 0.5443, + "step": 11427 + }, + { + "epoch": 0.89, + "grad_norm": 1.2314015561466816, + "learning_rate": 6.67185504825355e-07, + "loss": 0.4935, + "step": 11428 + }, + { + "epoch": 0.89, + "grad_norm": 1.2000672758635287, + "learning_rate": 6.662833840773064e-07, + "loss": 0.4746, + "step": 11429 + }, + { + "epoch": 0.89, + "grad_norm": 1.2415612999588386, + "learning_rate": 6.653818526149436e-07, + "loss": 0.5062, + "step": 11430 + }, + { + "epoch": 0.89, + "grad_norm": 1.2564989422157289, + "learning_rate": 6.644809104951866e-07, + "loss": 0.496, + "step": 11431 + }, + { + "epoch": 0.89, + "grad_norm": 1.169186215924927, + "learning_rate": 6.635805577749133e-07, + "loss": 0.4704, + "step": 11432 + }, + { + "epoch": 0.89, + "grad_norm": 1.2325190341090977, + "learning_rate": 6.626807945109681e-07, + "loss": 0.5077, + "step": 11433 + }, + { + "epoch": 0.89, + "grad_norm": 1.1688490902106825, + "learning_rate": 6.61781620760159e-07, + "loss": 0.5123, + "step": 11434 + }, + { + "epoch": 0.89, + "grad_norm": 1.349198230439379, + "learning_rate": 6.608830365792551e-07, + "loss": 0.5181, + "step": 11435 + }, + { + "epoch": 0.89, + "grad_norm": 1.2908356002892785, + "learning_rate": 6.59985042024991e-07, + "loss": 0.5258, + "step": 11436 + }, + { + "epoch": 0.89, + "grad_norm": 1.3881548685006135, + "learning_rate": 6.590876371540567e-07, + "loss": 0.5603, + "step": 11437 + }, + { + "epoch": 0.89, + "grad_norm": 1.184198961285118, + "learning_rate": 6.581908220231137e-07, + "loss": 0.4849, + "step": 11438 + }, + { + "epoch": 0.89, + "grad_norm": 1.1663623083566974, + "learning_rate": 6.572945966887812e-07, + "loss": 0.4801, + "step": 11439 + }, + { + "epoch": 0.89, + "grad_norm": 1.2436056139538596, + "learning_rate": 6.563989612076416e-07, + "loss": 0.5004, + "step": 11440 + }, + { + "epoch": 0.89, + "grad_norm": 1.2734395126607705, + "learning_rate": 6.55503915636243e-07, + "loss": 0.4826, + "step": 11441 + }, + { + "epoch": 0.89, + "grad_norm": 1.2667629636165563, + "learning_rate": 6.546094600310949e-07, + "loss": 0.5281, + "step": 11442 + }, + { + "epoch": 0.89, + "grad_norm": 1.2414451636234511, + "learning_rate": 6.537155944486662e-07, + "loss": 0.5047, + "step": 11443 + }, + { + "epoch": 0.89, + "grad_norm": 1.306529634833789, + "learning_rate": 6.52822318945392e-07, + "loss": 0.5592, + "step": 11444 + }, + { + "epoch": 0.89, + "grad_norm": 1.172634488996047, + "learning_rate": 6.519296335776703e-07, + "loss": 0.5198, + "step": 11445 + }, + { + "epoch": 0.89, + "grad_norm": 1.2214905175376964, + "learning_rate": 6.510375384018619e-07, + "loss": 0.4983, + "step": 11446 + }, + { + "epoch": 0.89, + "grad_norm": 1.1749153370487948, + "learning_rate": 6.50146033474287e-07, + "loss": 0.473, + "step": 11447 + }, + { + "epoch": 0.89, + "grad_norm": 1.2645459225613036, + "learning_rate": 6.492551188512331e-07, + "loss": 0.5368, + "step": 11448 + }, + { + "epoch": 0.89, + "grad_norm": 1.2899787651946117, + "learning_rate": 6.483647945889449e-07, + "loss": 0.528, + "step": 11449 + }, + { + "epoch": 0.89, + "grad_norm": 1.3149651129703641, + "learning_rate": 6.474750607436364e-07, + "loss": 0.5215, + "step": 11450 + }, + { + "epoch": 0.89, + "grad_norm": 1.1656019540916982, + "learning_rate": 6.465859173714784e-07, + "loss": 0.484, + "step": 11451 + }, + { + "epoch": 0.89, + "grad_norm": 1.3168021492525686, + "learning_rate": 6.456973645286113e-07, + "loss": 0.4718, + "step": 11452 + }, + { + "epoch": 0.89, + "grad_norm": 1.0959935202963917, + "learning_rate": 6.448094022711304e-07, + "loss": 0.4295, + "step": 11453 + }, + { + "epoch": 0.89, + "grad_norm": 1.2052575569794224, + "learning_rate": 6.439220306550975e-07, + "loss": 0.4882, + "step": 11454 + }, + { + "epoch": 0.89, + "grad_norm": 1.2282212819027056, + "learning_rate": 6.430352497365377e-07, + "loss": 0.5026, + "step": 11455 + }, + { + "epoch": 0.89, + "grad_norm": 1.2250761027927906, + "learning_rate": 6.421490595714408e-07, + "loss": 0.4626, + "step": 11456 + }, + { + "epoch": 0.89, + "grad_norm": 1.0775728332084265, + "learning_rate": 6.412634602157519e-07, + "loss": 0.5068, + "step": 11457 + }, + { + "epoch": 0.89, + "grad_norm": 1.0999708106763129, + "learning_rate": 6.403784517253863e-07, + "loss": 0.4958, + "step": 11458 + }, + { + "epoch": 0.89, + "grad_norm": 1.1620736252871733, + "learning_rate": 6.394940341562173e-07, + "loss": 0.4906, + "step": 11459 + }, + { + "epoch": 0.89, + "grad_norm": 1.1828295062718412, + "learning_rate": 6.386102075640843e-07, + "loss": 0.4696, + "step": 11460 + }, + { + "epoch": 0.89, + "grad_norm": 1.2605761382468663, + "learning_rate": 6.377269720047863e-07, + "loss": 0.5236, + "step": 11461 + }, + { + "epoch": 0.89, + "grad_norm": 1.2409199419318881, + "learning_rate": 6.368443275340897e-07, + "loss": 0.5308, + "step": 11462 + }, + { + "epoch": 0.89, + "grad_norm": 1.2605009077099896, + "learning_rate": 6.359622742077165e-07, + "loss": 0.5165, + "step": 11463 + }, + { + "epoch": 0.89, + "grad_norm": 1.1387573524179118, + "learning_rate": 6.350808120813567e-07, + "loss": 0.4512, + "step": 11464 + }, + { + "epoch": 0.89, + "grad_norm": 1.3529922772657272, + "learning_rate": 6.341999412106625e-07, + "loss": 0.5448, + "step": 11465 + }, + { + "epoch": 0.89, + "grad_norm": 1.1773897239249582, + "learning_rate": 6.333196616512493e-07, + "loss": 0.5101, + "step": 11466 + }, + { + "epoch": 0.89, + "grad_norm": 1.2891223315600924, + "learning_rate": 6.324399734586884e-07, + "loss": 0.5234, + "step": 11467 + }, + { + "epoch": 0.89, + "grad_norm": 1.2360339063234378, + "learning_rate": 6.315608766885251e-07, + "loss": 0.4961, + "step": 11468 + }, + { + "epoch": 0.89, + "grad_norm": 1.1634495292572455, + "learning_rate": 6.306823713962563e-07, + "loss": 0.4774, + "step": 11469 + }, + { + "epoch": 0.89, + "grad_norm": 1.2369934509523381, + "learning_rate": 6.298044576373485e-07, + "loss": 0.5346, + "step": 11470 + }, + { + "epoch": 0.89, + "grad_norm": 1.1100206242612793, + "learning_rate": 6.289271354672299e-07, + "loss": 0.4141, + "step": 11471 + }, + { + "epoch": 0.89, + "grad_norm": 1.1974501599828717, + "learning_rate": 6.280504049412905e-07, + "loss": 0.5232, + "step": 11472 + }, + { + "epoch": 0.89, + "grad_norm": 1.2707960665298836, + "learning_rate": 6.271742661148806e-07, + "loss": 0.4956, + "step": 11473 + }, + { + "epoch": 0.89, + "grad_norm": 1.1678222258144697, + "learning_rate": 6.262987190433168e-07, + "loss": 0.4848, + "step": 11474 + }, + { + "epoch": 0.89, + "grad_norm": 1.3655513455498731, + "learning_rate": 6.254237637818772e-07, + "loss": 0.4862, + "step": 11475 + }, + { + "epoch": 0.89, + "grad_norm": 1.2370121466272352, + "learning_rate": 6.245494003858022e-07, + "loss": 0.5202, + "step": 11476 + }, + { + "epoch": 0.89, + "grad_norm": 1.1478723186273423, + "learning_rate": 6.236756289102941e-07, + "loss": 0.4699, + "step": 11477 + }, + { + "epoch": 0.89, + "grad_norm": 1.331742151831977, + "learning_rate": 6.2280244941052e-07, + "loss": 0.4812, + "step": 11478 + }, + { + "epoch": 0.89, + "grad_norm": 1.2106345997647863, + "learning_rate": 6.219298619416059e-07, + "loss": 0.5125, + "step": 11479 + }, + { + "epoch": 0.89, + "grad_norm": 1.185470000387417, + "learning_rate": 6.210578665586442e-07, + "loss": 0.5052, + "step": 11480 + }, + { + "epoch": 0.89, + "grad_norm": 1.212580691926514, + "learning_rate": 6.201864633166877e-07, + "loss": 0.5194, + "step": 11481 + }, + { + "epoch": 0.89, + "grad_norm": 1.2551719005990398, + "learning_rate": 6.193156522707555e-07, + "loss": 0.486, + "step": 11482 + }, + { + "epoch": 0.89, + "grad_norm": 1.2418040037475517, + "learning_rate": 6.184454334758227e-07, + "loss": 0.5533, + "step": 11483 + }, + { + "epoch": 0.89, + "grad_norm": 1.2998216084812626, + "learning_rate": 6.175758069868321e-07, + "loss": 0.5332, + "step": 11484 + }, + { + "epoch": 0.89, + "grad_norm": 1.230697416826139, + "learning_rate": 6.167067728586873e-07, + "loss": 0.5078, + "step": 11485 + }, + { + "epoch": 0.89, + "grad_norm": 1.2159069842350292, + "learning_rate": 6.158383311462568e-07, + "loss": 0.5583, + "step": 11486 + }, + { + "epoch": 0.89, + "grad_norm": 1.2315289970357401, + "learning_rate": 6.149704819043667e-07, + "loss": 0.5096, + "step": 11487 + }, + { + "epoch": 0.89, + "grad_norm": 1.2246629134404277, + "learning_rate": 6.141032251878132e-07, + "loss": 0.4753, + "step": 11488 + }, + { + "epoch": 0.89, + "grad_norm": 1.302875059386743, + "learning_rate": 6.132365610513457e-07, + "loss": 0.5786, + "step": 11489 + }, + { + "epoch": 0.89, + "grad_norm": 1.2691977194838018, + "learning_rate": 6.123704895496829e-07, + "loss": 0.4784, + "step": 11490 + }, + { + "epoch": 0.89, + "grad_norm": 1.1189072184138322, + "learning_rate": 6.115050107375053e-07, + "loss": 0.4325, + "step": 11491 + }, + { + "epoch": 0.89, + "grad_norm": 1.2353978325543034, + "learning_rate": 6.106401246694549e-07, + "loss": 0.4636, + "step": 11492 + }, + { + "epoch": 0.89, + "grad_norm": 1.2985534836510755, + "learning_rate": 6.097758314001379e-07, + "loss": 0.5072, + "step": 11493 + }, + { + "epoch": 0.89, + "grad_norm": 1.2031456090970258, + "learning_rate": 6.089121309841173e-07, + "loss": 0.4634, + "step": 11494 + }, + { + "epoch": 0.89, + "grad_norm": 1.1414363535958993, + "learning_rate": 6.080490234759262e-07, + "loss": 0.458, + "step": 11495 + }, + { + "epoch": 0.89, + "grad_norm": 1.2074600495073577, + "learning_rate": 6.071865089300577e-07, + "loss": 0.4909, + "step": 11496 + }, + { + "epoch": 0.89, + "grad_norm": 1.059273983350523, + "learning_rate": 6.063245874009638e-07, + "loss": 0.4521, + "step": 11497 + }, + { + "epoch": 0.89, + "grad_norm": 1.1210650397156736, + "learning_rate": 6.054632589430654e-07, + "loss": 0.484, + "step": 11498 + }, + { + "epoch": 0.89, + "grad_norm": 1.2262033313266074, + "learning_rate": 6.0460252361074e-07, + "loss": 0.5371, + "step": 11499 + }, + { + "epoch": 0.89, + "grad_norm": 1.251673198476342, + "learning_rate": 6.037423814583299e-07, + "loss": 0.5448, + "step": 11500 + }, + { + "epoch": 0.89, + "grad_norm": 1.2883652622087698, + "learning_rate": 6.028828325401426e-07, + "loss": 0.5224, + "step": 11501 + }, + { + "epoch": 0.89, + "grad_norm": 1.2830469509641087, + "learning_rate": 6.020238769104447e-07, + "loss": 0.4772, + "step": 11502 + }, + { + "epoch": 0.89, + "grad_norm": 1.263750312646663, + "learning_rate": 6.011655146234674e-07, + "loss": 0.5337, + "step": 11503 + }, + { + "epoch": 0.89, + "grad_norm": 1.1592515699792645, + "learning_rate": 6.003077457334017e-07, + "loss": 0.4858, + "step": 11504 + }, + { + "epoch": 0.89, + "grad_norm": 1.2008153370751402, + "learning_rate": 5.994505702944042e-07, + "loss": 0.5066, + "step": 11505 + }, + { + "epoch": 0.89, + "grad_norm": 1.2612124628959334, + "learning_rate": 5.98593988360594e-07, + "loss": 0.5015, + "step": 11506 + }, + { + "epoch": 0.89, + "grad_norm": 1.147990340806951, + "learning_rate": 5.977379999860488e-07, + "loss": 0.4738, + "step": 11507 + }, + { + "epoch": 0.89, + "grad_norm": 1.114273164795503, + "learning_rate": 5.968826052248145e-07, + "loss": 0.4502, + "step": 11508 + }, + { + "epoch": 0.89, + "grad_norm": 1.2976836934825, + "learning_rate": 5.960278041308931e-07, + "loss": 0.4902, + "step": 11509 + }, + { + "epoch": 0.89, + "grad_norm": 1.2014418880015592, + "learning_rate": 5.951735967582551e-07, + "loss": 0.5329, + "step": 11510 + }, + { + "epoch": 0.89, + "grad_norm": 1.2742328617289833, + "learning_rate": 5.943199831608304e-07, + "loss": 0.5202, + "step": 11511 + }, + { + "epoch": 0.89, + "grad_norm": 1.2587320982219765, + "learning_rate": 5.934669633925116e-07, + "loss": 0.4809, + "step": 11512 + }, + { + "epoch": 0.89, + "grad_norm": 1.16595699092592, + "learning_rate": 5.926145375071568e-07, + "loss": 0.4664, + "step": 11513 + }, + { + "epoch": 0.89, + "grad_norm": 1.1908485983601393, + "learning_rate": 5.917627055585807e-07, + "loss": 0.4664, + "step": 11514 + }, + { + "epoch": 0.89, + "grad_norm": 1.157351613430679, + "learning_rate": 5.909114676005645e-07, + "loss": 0.462, + "step": 11515 + }, + { + "epoch": 0.89, + "grad_norm": 1.277930635256316, + "learning_rate": 5.900608236868532e-07, + "loss": 0.5313, + "step": 11516 + }, + { + "epoch": 0.89, + "grad_norm": 1.4648454589833781, + "learning_rate": 5.892107738711505e-07, + "loss": 0.5294, + "step": 11517 + }, + { + "epoch": 0.89, + "grad_norm": 1.2096183236361553, + "learning_rate": 5.883613182071257e-07, + "loss": 0.5709, + "step": 11518 + }, + { + "epoch": 0.89, + "grad_norm": 1.1865059054613942, + "learning_rate": 5.875124567484069e-07, + "loss": 0.5131, + "step": 11519 + }, + { + "epoch": 0.89, + "grad_norm": 1.148821137376876, + "learning_rate": 5.866641895485892e-07, + "loss": 0.4299, + "step": 11520 + }, + { + "epoch": 0.89, + "grad_norm": 1.1251920430196, + "learning_rate": 5.858165166612273e-07, + "loss": 0.4955, + "step": 11521 + }, + { + "epoch": 0.89, + "grad_norm": 1.195691366552233, + "learning_rate": 5.849694381398396e-07, + "loss": 0.544, + "step": 11522 + }, + { + "epoch": 0.89, + "grad_norm": 1.1546910060060207, + "learning_rate": 5.841229540379079e-07, + "loss": 0.5319, + "step": 11523 + }, + { + "epoch": 0.89, + "grad_norm": 1.1479977135452735, + "learning_rate": 5.832770644088726e-07, + "loss": 0.5128, + "step": 11524 + }, + { + "epoch": 0.89, + "grad_norm": 1.2128555857930658, + "learning_rate": 5.824317693061387e-07, + "loss": 0.4753, + "step": 11525 + }, + { + "epoch": 0.89, + "grad_norm": 1.241292619155997, + "learning_rate": 5.815870687830782e-07, + "loss": 0.5141, + "step": 11526 + }, + { + "epoch": 0.89, + "grad_norm": 1.1653126397801563, + "learning_rate": 5.807429628930172e-07, + "loss": 0.4541, + "step": 11527 + }, + { + "epoch": 0.89, + "grad_norm": 1.177684523506373, + "learning_rate": 5.798994516892509e-07, + "loss": 0.5136, + "step": 11528 + }, + { + "epoch": 0.89, + "grad_norm": 1.0739360090439753, + "learning_rate": 5.790565352250322e-07, + "loss": 0.4287, + "step": 11529 + }, + { + "epoch": 0.89, + "grad_norm": 1.301586646397559, + "learning_rate": 5.782142135535806e-07, + "loss": 0.4968, + "step": 11530 + }, + { + "epoch": 0.89, + "grad_norm": 1.1061639655327724, + "learning_rate": 5.77372486728075e-07, + "loss": 0.4648, + "step": 11531 + }, + { + "epoch": 0.89, + "grad_norm": 1.1582700250097908, + "learning_rate": 5.765313548016593e-07, + "loss": 0.5057, + "step": 11532 + }, + { + "epoch": 0.89, + "grad_norm": 1.3245410808213256, + "learning_rate": 5.756908178274389e-07, + "loss": 0.5502, + "step": 11533 + }, + { + "epoch": 0.89, + "grad_norm": 1.1218823042182844, + "learning_rate": 5.748508758584792e-07, + "loss": 0.4495, + "step": 11534 + }, + { + "epoch": 0.89, + "grad_norm": 1.2571300289967215, + "learning_rate": 5.740115289478109e-07, + "loss": 0.4927, + "step": 11535 + }, + { + "epoch": 0.89, + "grad_norm": 1.2030677781802523, + "learning_rate": 5.731727771484275e-07, + "loss": 0.5042, + "step": 11536 + }, + { + "epoch": 0.9, + "grad_norm": 1.1734218814790778, + "learning_rate": 5.723346205132818e-07, + "loss": 0.5223, + "step": 11537 + }, + { + "epoch": 0.9, + "grad_norm": 1.3448370595175896, + "learning_rate": 5.714970590952939e-07, + "loss": 0.5226, + "step": 11538 + }, + { + "epoch": 0.9, + "grad_norm": 1.0855007253791147, + "learning_rate": 5.706600929473382e-07, + "loss": 0.4566, + "step": 11539 + }, + { + "epoch": 0.9, + "grad_norm": 1.2138332076775586, + "learning_rate": 5.698237221222614e-07, + "loss": 0.5171, + "step": 11540 + }, + { + "epoch": 0.9, + "grad_norm": 1.2539276881776915, + "learning_rate": 5.689879466728654e-07, + "loss": 0.5312, + "step": 11541 + }, + { + "epoch": 0.9, + "grad_norm": 1.3131047626662606, + "learning_rate": 5.681527666519182e-07, + "loss": 0.5372, + "step": 11542 + }, + { + "epoch": 0.9, + "grad_norm": 1.2661229319706675, + "learning_rate": 5.673181821121509e-07, + "loss": 0.4981, + "step": 11543 + }, + { + "epoch": 0.9, + "grad_norm": 1.1311052809400346, + "learning_rate": 5.664841931062504e-07, + "loss": 0.5265, + "step": 11544 + }, + { + "epoch": 0.9, + "grad_norm": 1.2112644861573485, + "learning_rate": 5.656507996868743e-07, + "loss": 0.4923, + "step": 11545 + }, + { + "epoch": 0.9, + "grad_norm": 1.1716073811446706, + "learning_rate": 5.648180019066385e-07, + "loss": 0.5145, + "step": 11546 + }, + { + "epoch": 0.9, + "grad_norm": 1.033528786806913, + "learning_rate": 5.639857998181208e-07, + "loss": 0.4607, + "step": 11547 + }, + { + "epoch": 0.9, + "grad_norm": 1.071128589778587, + "learning_rate": 5.631541934738627e-07, + "loss": 0.4024, + "step": 11548 + }, + { + "epoch": 0.9, + "grad_norm": 1.1293005865056507, + "learning_rate": 5.623231829263698e-07, + "loss": 0.4428, + "step": 11549 + }, + { + "epoch": 0.9, + "grad_norm": 1.0832866450788494, + "learning_rate": 5.614927682281046e-07, + "loss": 0.4554, + "step": 11550 + }, + { + "epoch": 0.9, + "grad_norm": 1.252620239577666, + "learning_rate": 5.606629494314963e-07, + "loss": 0.4963, + "step": 11551 + }, + { + "epoch": 0.9, + "grad_norm": 1.3517702858025151, + "learning_rate": 5.598337265889375e-07, + "loss": 0.5377, + "step": 11552 + }, + { + "epoch": 0.9, + "grad_norm": 1.2697677626744661, + "learning_rate": 5.590050997527808e-07, + "loss": 0.4895, + "step": 11553 + }, + { + "epoch": 0.9, + "grad_norm": 1.0985110067025359, + "learning_rate": 5.581770689753429e-07, + "loss": 0.438, + "step": 11554 + }, + { + "epoch": 0.9, + "grad_norm": 1.1086773895262008, + "learning_rate": 5.57349634308898e-07, + "loss": 0.4963, + "step": 11555 + }, + { + "epoch": 0.9, + "grad_norm": 1.2357812910553831, + "learning_rate": 5.565227958056896e-07, + "loss": 0.4954, + "step": 11556 + }, + { + "epoch": 0.9, + "grad_norm": 1.1912208788135292, + "learning_rate": 5.556965535179182e-07, + "loss": 0.4829, + "step": 11557 + }, + { + "epoch": 0.9, + "grad_norm": 1.3335518757213265, + "learning_rate": 5.548709074977487e-07, + "loss": 0.5646, + "step": 11558 + }, + { + "epoch": 0.9, + "grad_norm": 1.3082056181625616, + "learning_rate": 5.540458577973118e-07, + "loss": 0.5054, + "step": 11559 + }, + { + "epoch": 0.9, + "grad_norm": 1.2630733151363325, + "learning_rate": 5.532214044686923e-07, + "loss": 0.4922, + "step": 11560 + }, + { + "epoch": 0.9, + "grad_norm": 1.1330235251823495, + "learning_rate": 5.523975475639443e-07, + "loss": 0.4787, + "step": 11561 + }, + { + "epoch": 0.9, + "grad_norm": 1.173797708583798, + "learning_rate": 5.515742871350838e-07, + "loss": 0.4792, + "step": 11562 + }, + { + "epoch": 0.9, + "grad_norm": 1.1836511075565557, + "learning_rate": 5.507516232340848e-07, + "loss": 0.4875, + "step": 11563 + }, + { + "epoch": 0.9, + "grad_norm": 1.191391841613695, + "learning_rate": 5.499295559128892e-07, + "loss": 0.4711, + "step": 11564 + }, + { + "epoch": 0.9, + "grad_norm": 1.2340279586258522, + "learning_rate": 5.491080852233955e-07, + "loss": 0.5353, + "step": 11565 + }, + { + "epoch": 0.9, + "grad_norm": 1.1752932223235342, + "learning_rate": 5.482872112174698e-07, + "loss": 0.4687, + "step": 11566 + }, + { + "epoch": 0.9, + "grad_norm": 1.2523062411835026, + "learning_rate": 5.474669339469351e-07, + "loss": 0.4597, + "step": 11567 + }, + { + "epoch": 0.9, + "grad_norm": 1.148199926183579, + "learning_rate": 5.466472534635814e-07, + "loss": 0.5053, + "step": 11568 + }, + { + "epoch": 0.9, + "grad_norm": 1.2781162084449653, + "learning_rate": 5.458281698191615e-07, + "loss": 0.5407, + "step": 11569 + }, + { + "epoch": 0.9, + "grad_norm": 1.2211590945282969, + "learning_rate": 5.450096830653851e-07, + "loss": 0.4956, + "step": 11570 + }, + { + "epoch": 0.9, + "grad_norm": 1.2045974638426897, + "learning_rate": 5.441917932539287e-07, + "loss": 0.5199, + "step": 11571 + }, + { + "epoch": 0.9, + "grad_norm": 1.1920544971354834, + "learning_rate": 5.433745004364299e-07, + "loss": 0.5059, + "step": 11572 + }, + { + "epoch": 0.9, + "grad_norm": 1.1258671385637893, + "learning_rate": 5.425578046644886e-07, + "loss": 0.4482, + "step": 11573 + }, + { + "epoch": 0.9, + "grad_norm": 1.3430994810127084, + "learning_rate": 5.417417059896679e-07, + "loss": 0.5298, + "step": 11574 + }, + { + "epoch": 0.9, + "grad_norm": 1.30765711728383, + "learning_rate": 5.4092620446349e-07, + "loss": 0.5017, + "step": 11575 + }, + { + "epoch": 0.9, + "grad_norm": 1.3845164689572027, + "learning_rate": 5.401113001374459e-07, + "loss": 0.5462, + "step": 11576 + }, + { + "epoch": 0.9, + "grad_norm": 1.2505986687415667, + "learning_rate": 5.392969930629799e-07, + "loss": 0.4704, + "step": 11577 + }, + { + "epoch": 0.9, + "grad_norm": 1.1860635501985373, + "learning_rate": 5.384832832915055e-07, + "loss": 0.4972, + "step": 11578 + }, + { + "epoch": 0.9, + "grad_norm": 1.2304688468811966, + "learning_rate": 5.376701708743981e-07, + "loss": 0.515, + "step": 11579 + }, + { + "epoch": 0.9, + "grad_norm": 1.0690601010534004, + "learning_rate": 5.368576558629901e-07, + "loss": 0.4622, + "step": 11580 + }, + { + "epoch": 0.9, + "grad_norm": 1.2907584853335778, + "learning_rate": 5.360457383085816e-07, + "loss": 0.5495, + "step": 11581 + }, + { + "epoch": 0.9, + "grad_norm": 1.3180428584168025, + "learning_rate": 5.352344182624336e-07, + "loss": 0.5582, + "step": 11582 + }, + { + "epoch": 0.9, + "grad_norm": 1.2552603662029, + "learning_rate": 5.344236957757687e-07, + "loss": 0.5372, + "step": 11583 + }, + { + "epoch": 0.9, + "grad_norm": 1.2918543781762506, + "learning_rate": 5.336135708997725e-07, + "loss": 0.5407, + "step": 11584 + }, + { + "epoch": 0.9, + "grad_norm": 1.054848778720736, + "learning_rate": 5.328040436855908e-07, + "loss": 0.453, + "step": 11585 + }, + { + "epoch": 0.9, + "grad_norm": 1.310663573389164, + "learning_rate": 5.31995114184336e-07, + "loss": 0.5397, + "step": 11586 + }, + { + "epoch": 0.9, + "grad_norm": 1.2875970544621982, + "learning_rate": 5.311867824470762e-07, + "loss": 0.5238, + "step": 11587 + }, + { + "epoch": 0.9, + "grad_norm": 1.1991838083550863, + "learning_rate": 5.303790485248472e-07, + "loss": 0.5118, + "step": 11588 + }, + { + "epoch": 0.9, + "grad_norm": 1.1703311159820398, + "learning_rate": 5.295719124686482e-07, + "loss": 0.4961, + "step": 11589 + }, + { + "epoch": 0.9, + "grad_norm": 1.1631919114320926, + "learning_rate": 5.28765374329433e-07, + "loss": 0.4735, + "step": 11590 + }, + { + "epoch": 0.9, + "grad_norm": 1.2684882941722748, + "learning_rate": 5.279594341581252e-07, + "loss": 0.4885, + "step": 11591 + }, + { + "epoch": 0.9, + "grad_norm": 1.2766723734941412, + "learning_rate": 5.271540920056073e-07, + "loss": 0.5179, + "step": 11592 + }, + { + "epoch": 0.9, + "grad_norm": 1.1883534326142073, + "learning_rate": 5.263493479227255e-07, + "loss": 0.4671, + "step": 11593 + }, + { + "epoch": 0.9, + "grad_norm": 1.2319199012845141, + "learning_rate": 5.25545201960288e-07, + "loss": 0.5094, + "step": 11594 + }, + { + "epoch": 0.9, + "grad_norm": 1.2287588197191746, + "learning_rate": 5.247416541690642e-07, + "loss": 0.4652, + "step": 11595 + }, + { + "epoch": 0.9, + "grad_norm": 1.476701517978316, + "learning_rate": 5.239387045997835e-07, + "loss": 0.5068, + "step": 11596 + }, + { + "epoch": 0.9, + "grad_norm": 1.163370989533444, + "learning_rate": 5.231363533031431e-07, + "loss": 0.4676, + "step": 11597 + }, + { + "epoch": 0.9, + "grad_norm": 1.2597525189331573, + "learning_rate": 5.22334600329798e-07, + "loss": 0.521, + "step": 11598 + }, + { + "epoch": 0.9, + "grad_norm": 1.325676669669993, + "learning_rate": 5.2153344573037e-07, + "loss": 0.4791, + "step": 11599 + }, + { + "epoch": 0.9, + "grad_norm": 1.256546378403438, + "learning_rate": 5.207328895554365e-07, + "loss": 0.4921, + "step": 11600 + }, + { + "epoch": 0.9, + "grad_norm": 1.3392610756560481, + "learning_rate": 5.199329318555424e-07, + "loss": 0.5081, + "step": 11601 + }, + { + "epoch": 0.9, + "grad_norm": 1.20082164093368, + "learning_rate": 5.191335726811931e-07, + "loss": 0.4769, + "step": 11602 + }, + { + "epoch": 0.9, + "grad_norm": 1.1291575432628187, + "learning_rate": 5.183348120828558e-07, + "loss": 0.4957, + "step": 11603 + }, + { + "epoch": 0.9, + "grad_norm": 1.1954025035208389, + "learning_rate": 5.175366501109625e-07, + "loss": 0.4782, + "step": 11604 + }, + { + "epoch": 0.9, + "grad_norm": 1.0798469834188902, + "learning_rate": 5.167390868159028e-07, + "loss": 0.4801, + "step": 11605 + }, + { + "epoch": 0.9, + "grad_norm": 1.3001650705440604, + "learning_rate": 5.15942122248031e-07, + "loss": 0.4899, + "step": 11606 + }, + { + "epoch": 0.9, + "grad_norm": 1.2203706578501075, + "learning_rate": 5.151457564576645e-07, + "loss": 0.5215, + "step": 11607 + }, + { + "epoch": 0.9, + "grad_norm": 1.1625401828342996, + "learning_rate": 5.14349989495082e-07, + "loss": 0.4965, + "step": 11608 + }, + { + "epoch": 0.9, + "grad_norm": 1.1946233620316558, + "learning_rate": 5.135548214105235e-07, + "loss": 0.5228, + "step": 11609 + }, + { + "epoch": 0.9, + "grad_norm": 1.171749108228106, + "learning_rate": 5.127602522541942e-07, + "loss": 0.4461, + "step": 11610 + }, + { + "epoch": 0.9, + "grad_norm": 1.240285172511022, + "learning_rate": 5.119662820762572e-07, + "loss": 0.5171, + "step": 11611 + }, + { + "epoch": 0.9, + "grad_norm": 1.2563639764656076, + "learning_rate": 5.111729109268405e-07, + "loss": 0.5591, + "step": 11612 + }, + { + "epoch": 0.9, + "grad_norm": 1.2266489472266957, + "learning_rate": 5.103801388560337e-07, + "loss": 0.5103, + "step": 11613 + }, + { + "epoch": 0.9, + "grad_norm": 1.097857853848195, + "learning_rate": 5.095879659138892e-07, + "loss": 0.4718, + "step": 11614 + }, + { + "epoch": 0.9, + "grad_norm": 1.306902165032515, + "learning_rate": 5.087963921504213e-07, + "loss": 0.5301, + "step": 11615 + }, + { + "epoch": 0.9, + "grad_norm": 1.3801666637344339, + "learning_rate": 5.080054176156035e-07, + "loss": 0.5312, + "step": 11616 + }, + { + "epoch": 0.9, + "grad_norm": 1.1768228979828945, + "learning_rate": 5.072150423593769e-07, + "loss": 0.5405, + "step": 11617 + }, + { + "epoch": 0.9, + "grad_norm": 1.097660051115249, + "learning_rate": 5.064252664316405e-07, + "loss": 0.4732, + "step": 11618 + }, + { + "epoch": 0.9, + "grad_norm": 1.2503430849361337, + "learning_rate": 5.056360898822577e-07, + "loss": 0.5135, + "step": 11619 + }, + { + "epoch": 0.9, + "grad_norm": 1.16105427007145, + "learning_rate": 5.048475127610531e-07, + "loss": 0.485, + "step": 11620 + }, + { + "epoch": 0.9, + "grad_norm": 1.2480716613314087, + "learning_rate": 5.040595351178134e-07, + "loss": 0.4756, + "step": 11621 + }, + { + "epoch": 0.9, + "grad_norm": 1.330302349801889, + "learning_rate": 5.032721570022881e-07, + "loss": 0.5101, + "step": 11622 + }, + { + "epoch": 0.9, + "grad_norm": 1.1913059880327987, + "learning_rate": 5.02485378464187e-07, + "loss": 0.5096, + "step": 11623 + }, + { + "epoch": 0.9, + "grad_norm": 1.1441361718629999, + "learning_rate": 5.016991995531872e-07, + "loss": 0.4722, + "step": 11624 + }, + { + "epoch": 0.9, + "grad_norm": 1.117943141385377, + "learning_rate": 5.009136203189214e-07, + "loss": 0.4632, + "step": 11625 + }, + { + "epoch": 0.9, + "grad_norm": 1.2158038402498605, + "learning_rate": 5.001286408109862e-07, + "loss": 0.5401, + "step": 11626 + }, + { + "epoch": 0.9, + "grad_norm": 1.1248110506540474, + "learning_rate": 4.993442610789423e-07, + "loss": 0.503, + "step": 11627 + }, + { + "epoch": 0.9, + "grad_norm": 1.333943356599647, + "learning_rate": 4.985604811723133e-07, + "loss": 0.5397, + "step": 11628 + }, + { + "epoch": 0.9, + "grad_norm": 1.230414882873489, + "learning_rate": 4.977773011405806e-07, + "loss": 0.4974, + "step": 11629 + }, + { + "epoch": 0.9, + "grad_norm": 1.2653788103443642, + "learning_rate": 4.969947210331938e-07, + "loss": 0.542, + "step": 11630 + }, + { + "epoch": 0.9, + "grad_norm": 1.094276301597115, + "learning_rate": 4.962127408995587e-07, + "loss": 0.4612, + "step": 11631 + }, + { + "epoch": 0.9, + "grad_norm": 1.1960067727366028, + "learning_rate": 4.95431360789046e-07, + "loss": 0.469, + "step": 11632 + }, + { + "epoch": 0.9, + "grad_norm": 1.20365740403089, + "learning_rate": 4.946505807509883e-07, + "loss": 0.487, + "step": 11633 + }, + { + "epoch": 0.9, + "grad_norm": 1.2335867960699443, + "learning_rate": 4.938704008346818e-07, + "loss": 0.5124, + "step": 11634 + }, + { + "epoch": 0.9, + "grad_norm": 1.2364535155144885, + "learning_rate": 4.930908210893826e-07, + "loss": 0.4667, + "step": 11635 + }, + { + "epoch": 0.9, + "grad_norm": 1.3809632258232114, + "learning_rate": 4.92311841564308e-07, + "loss": 0.562, + "step": 11636 + }, + { + "epoch": 0.9, + "grad_norm": 1.1480338496624145, + "learning_rate": 4.915334623086387e-07, + "loss": 0.4954, + "step": 11637 + }, + { + "epoch": 0.9, + "grad_norm": 1.172027832237827, + "learning_rate": 4.907556833715199e-07, + "loss": 0.4747, + "step": 11638 + }, + { + "epoch": 0.9, + "grad_norm": 1.2427988047680911, + "learning_rate": 4.899785048020567e-07, + "loss": 0.4948, + "step": 11639 + }, + { + "epoch": 0.9, + "grad_norm": 1.170583241904792, + "learning_rate": 4.892019266493164e-07, + "loss": 0.5279, + "step": 11640 + }, + { + "epoch": 0.9, + "grad_norm": 1.196469314385757, + "learning_rate": 4.884259489623267e-07, + "loss": 0.475, + "step": 11641 + }, + { + "epoch": 0.9, + "grad_norm": 1.1514619718691659, + "learning_rate": 4.876505717900803e-07, + "loss": 0.4724, + "step": 11642 + }, + { + "epoch": 0.9, + "grad_norm": 1.2511098226403587, + "learning_rate": 4.868757951815295e-07, + "loss": 0.5248, + "step": 11643 + }, + { + "epoch": 0.9, + "grad_norm": 1.3178662091193285, + "learning_rate": 4.861016191855939e-07, + "loss": 0.5234, + "step": 11644 + }, + { + "epoch": 0.9, + "grad_norm": 1.2354141400436287, + "learning_rate": 4.853280438511476e-07, + "loss": 0.5092, + "step": 11645 + }, + { + "epoch": 0.9, + "grad_norm": 1.098511440778354, + "learning_rate": 4.845550692270296e-07, + "loss": 0.498, + "step": 11646 + }, + { + "epoch": 0.9, + "grad_norm": 1.2128744078210814, + "learning_rate": 4.837826953620428e-07, + "loss": 0.4699, + "step": 11647 + }, + { + "epoch": 0.9, + "grad_norm": 1.1940351654893464, + "learning_rate": 4.830109223049528e-07, + "loss": 0.4796, + "step": 11648 + }, + { + "epoch": 0.9, + "grad_norm": 1.2091029883946196, + "learning_rate": 4.82239750104484e-07, + "loss": 0.5288, + "step": 11649 + }, + { + "epoch": 0.9, + "grad_norm": 1.1562578871174463, + "learning_rate": 4.814691788093262e-07, + "loss": 0.4894, + "step": 11650 + }, + { + "epoch": 0.9, + "grad_norm": 1.3747158190356996, + "learning_rate": 4.806992084681273e-07, + "loss": 0.5272, + "step": 11651 + }, + { + "epoch": 0.9, + "grad_norm": 1.2156153739803044, + "learning_rate": 4.799298391295004e-07, + "loss": 0.5203, + "step": 11652 + }, + { + "epoch": 0.9, + "grad_norm": 1.1915433210724786, + "learning_rate": 4.791610708420192e-07, + "loss": 0.4939, + "step": 11653 + }, + { + "epoch": 0.9, + "grad_norm": 1.3170288605826161, + "learning_rate": 4.783929036542234e-07, + "loss": 0.5232, + "step": 11654 + }, + { + "epoch": 0.9, + "grad_norm": 1.2679209664320248, + "learning_rate": 4.776253376146078e-07, + "loss": 0.5041, + "step": 11655 + }, + { + "epoch": 0.9, + "grad_norm": 1.1599221041438859, + "learning_rate": 4.768583727716314e-07, + "loss": 0.4681, + "step": 11656 + }, + { + "epoch": 0.9, + "grad_norm": 1.1336253965601941, + "learning_rate": 4.760920091737198e-07, + "loss": 0.473, + "step": 11657 + }, + { + "epoch": 0.9, + "grad_norm": 1.1305282186962131, + "learning_rate": 4.7532624686925655e-07, + "loss": 0.4607, + "step": 11658 + }, + { + "epoch": 0.9, + "grad_norm": 1.1431852374257534, + "learning_rate": 4.745610859065886e-07, + "loss": 0.4501, + "step": 11659 + }, + { + "epoch": 0.9, + "grad_norm": 1.3306119634546028, + "learning_rate": 4.7379652633402386e-07, + "loss": 0.4928, + "step": 11660 + }, + { + "epoch": 0.9, + "grad_norm": 1.1978248173980799, + "learning_rate": 4.730325681998338e-07, + "loss": 0.5595, + "step": 11661 + }, + { + "epoch": 0.9, + "grad_norm": 1.219627015913165, + "learning_rate": 4.722692115522498e-07, + "loss": 0.5345, + "step": 11662 + }, + { + "epoch": 0.9, + "grad_norm": 1.223947982203224, + "learning_rate": 4.715064564394667e-07, + "loss": 0.5003, + "step": 11663 + }, + { + "epoch": 0.9, + "grad_norm": 1.2324123472882256, + "learning_rate": 4.707443029096437e-07, + "loss": 0.457, + "step": 11664 + }, + { + "epoch": 0.9, + "grad_norm": 1.0759650445315745, + "learning_rate": 4.6998275101089454e-07, + "loss": 0.4797, + "step": 11665 + }, + { + "epoch": 0.91, + "grad_norm": 1.185037468909366, + "learning_rate": 4.692218007913052e-07, + "loss": 0.5174, + "step": 11666 + }, + { + "epoch": 0.91, + "grad_norm": 1.2233893783284107, + "learning_rate": 4.6846145229891393e-07, + "loss": 0.4962, + "step": 11667 + }, + { + "epoch": 0.91, + "grad_norm": 1.2070307561879552, + "learning_rate": 4.677017055817279e-07, + "loss": 0.498, + "step": 11668 + }, + { + "epoch": 0.91, + "grad_norm": 1.306967017401899, + "learning_rate": 4.6694256068771314e-07, + "loss": 0.5078, + "step": 11669 + }, + { + "epoch": 0.91, + "grad_norm": 1.1385053509606156, + "learning_rate": 4.661840176647991e-07, + "loss": 0.4506, + "step": 11670 + }, + { + "epoch": 0.91, + "grad_norm": 1.260228981146994, + "learning_rate": 4.654260765608776e-07, + "loss": 0.5134, + "step": 11671 + }, + { + "epoch": 0.91, + "grad_norm": 1.2414791558705525, + "learning_rate": 4.646687374237979e-07, + "loss": 0.481, + "step": 11672 + }, + { + "epoch": 0.91, + "grad_norm": 1.224950569479002, + "learning_rate": 4.639120003013764e-07, + "loss": 0.4756, + "step": 11673 + }, + { + "epoch": 0.91, + "grad_norm": 1.2187451093526673, + "learning_rate": 4.6315586524139143e-07, + "loss": 0.537, + "step": 11674 + }, + { + "epoch": 0.91, + "grad_norm": 1.3131997649938285, + "learning_rate": 4.6240033229157934e-07, + "loss": 0.563, + "step": 11675 + }, + { + "epoch": 0.91, + "grad_norm": 1.1431704819663746, + "learning_rate": 4.61645401499643e-07, + "loss": 0.4919, + "step": 11676 + }, + { + "epoch": 0.91, + "grad_norm": 1.1505580190943998, + "learning_rate": 4.608910729132432e-07, + "loss": 0.4722, + "step": 11677 + }, + { + "epoch": 0.91, + "grad_norm": 1.1179001142651064, + "learning_rate": 4.6013734658000406e-07, + "loss": 0.4819, + "step": 11678 + }, + { + "epoch": 0.91, + "grad_norm": 1.2458995799352273, + "learning_rate": 4.5938422254751425e-07, + "loss": 0.5234, + "step": 11679 + }, + { + "epoch": 0.91, + "grad_norm": 1.1915886411042884, + "learning_rate": 4.5863170086332234e-07, + "loss": 0.5037, + "step": 11680 + }, + { + "epoch": 0.91, + "grad_norm": 1.1996697189869534, + "learning_rate": 4.578797815749381e-07, + "loss": 0.4958, + "step": 11681 + }, + { + "epoch": 0.91, + "grad_norm": 1.3254755858047529, + "learning_rate": 4.571284647298335e-07, + "loss": 0.5179, + "step": 11682 + }, + { + "epoch": 0.91, + "grad_norm": 1.4197493373562327, + "learning_rate": 4.5637775037544296e-07, + "loss": 0.5397, + "step": 11683 + }, + { + "epoch": 0.91, + "grad_norm": 1.1658080665480766, + "learning_rate": 4.556276385591663e-07, + "loss": 0.4958, + "step": 11684 + }, + { + "epoch": 0.91, + "grad_norm": 1.196314472784978, + "learning_rate": 4.548781293283566e-07, + "loss": 0.4759, + "step": 11685 + }, + { + "epoch": 0.91, + "grad_norm": 1.084411152173807, + "learning_rate": 4.541292227303384e-07, + "loss": 0.4522, + "step": 11686 + }, + { + "epoch": 0.91, + "grad_norm": 1.252107131697988, + "learning_rate": 4.533809188123917e-07, + "loss": 0.5192, + "step": 11687 + }, + { + "epoch": 0.91, + "grad_norm": 1.241396430176583, + "learning_rate": 4.526332176217618e-07, + "loss": 0.5326, + "step": 11688 + }, + { + "epoch": 0.91, + "grad_norm": 1.1364092848018812, + "learning_rate": 4.518861192056545e-07, + "loss": 0.4621, + "step": 11689 + }, + { + "epoch": 0.91, + "grad_norm": 1.3141722697926925, + "learning_rate": 4.511396236112386e-07, + "loss": 0.4955, + "step": 11690 + }, + { + "epoch": 0.91, + "grad_norm": 1.2908912864286366, + "learning_rate": 4.503937308856454e-07, + "loss": 0.5272, + "step": 11691 + }, + { + "epoch": 0.91, + "grad_norm": 1.1712128866823428, + "learning_rate": 4.4964844107596384e-07, + "loss": 0.4973, + "step": 11692 + }, + { + "epoch": 0.91, + "grad_norm": 1.1982866572776498, + "learning_rate": 4.489037542292507e-07, + "loss": 0.4444, + "step": 11693 + }, + { + "epoch": 0.91, + "grad_norm": 1.172101164292414, + "learning_rate": 4.481596703925195e-07, + "loss": 0.4915, + "step": 11694 + }, + { + "epoch": 0.91, + "grad_norm": 1.2125661419004228, + "learning_rate": 4.4741618961274936e-07, + "loss": 0.4956, + "step": 11695 + }, + { + "epoch": 0.91, + "grad_norm": 1.153808335454769, + "learning_rate": 4.4667331193688155e-07, + "loss": 0.4789, + "step": 11696 + }, + { + "epoch": 0.91, + "grad_norm": 1.1952670718268237, + "learning_rate": 4.459310374118142e-07, + "loss": 0.4465, + "step": 11697 + }, + { + "epoch": 0.91, + "grad_norm": 1.3161940587573808, + "learning_rate": 4.451893660844142e-07, + "loss": 0.4697, + "step": 11698 + }, + { + "epoch": 0.91, + "grad_norm": 1.2213759376304432, + "learning_rate": 4.4444829800150524e-07, + "loss": 0.4709, + "step": 11699 + }, + { + "epoch": 0.91, + "grad_norm": 1.192370315473772, + "learning_rate": 4.437078332098754e-07, + "loss": 0.4807, + "step": 11700 + }, + { + "epoch": 0.91, + "grad_norm": 1.2790178067367342, + "learning_rate": 4.4296797175627517e-07, + "loss": 0.5051, + "step": 11701 + }, + { + "epoch": 0.91, + "grad_norm": 1.199433347903852, + "learning_rate": 4.422287136874126e-07, + "loss": 0.472, + "step": 11702 + }, + { + "epoch": 0.91, + "grad_norm": 1.2479101353868722, + "learning_rate": 4.41490059049966e-07, + "loss": 0.5427, + "step": 11703 + }, + { + "epoch": 0.91, + "grad_norm": 1.159155005792415, + "learning_rate": 4.407520078905647e-07, + "loss": 0.4453, + "step": 11704 + }, + { + "epoch": 0.91, + "grad_norm": 1.225054402992305, + "learning_rate": 4.4001456025580925e-07, + "loss": 0.5116, + "step": 11705 + }, + { + "epoch": 0.91, + "grad_norm": 1.1330927535032986, + "learning_rate": 4.3927771619225787e-07, + "loss": 0.4546, + "step": 11706 + }, + { + "epoch": 0.91, + "grad_norm": 1.3007622794752653, + "learning_rate": 4.385414757464312e-07, + "loss": 0.4898, + "step": 11707 + }, + { + "epoch": 0.91, + "grad_norm": 1.2646846810232406, + "learning_rate": 4.37805838964811e-07, + "loss": 0.5187, + "step": 11708 + }, + { + "epoch": 0.91, + "grad_norm": 1.2703951669143438, + "learning_rate": 4.370708058938422e-07, + "loss": 0.5491, + "step": 11709 + }, + { + "epoch": 0.91, + "grad_norm": 1.1870486757024492, + "learning_rate": 4.363363765799322e-07, + "loss": 0.5152, + "step": 11710 + }, + { + "epoch": 0.91, + "grad_norm": 1.098214167197262, + "learning_rate": 4.356025510694495e-07, + "loss": 0.5115, + "step": 11711 + }, + { + "epoch": 0.91, + "grad_norm": 1.2990648555848965, + "learning_rate": 4.348693294087236e-07, + "loss": 0.5695, + "step": 11712 + }, + { + "epoch": 0.91, + "grad_norm": 1.1769447526402916, + "learning_rate": 4.3413671164404757e-07, + "loss": 0.5, + "step": 11713 + }, + { + "epoch": 0.91, + "grad_norm": 1.2447782646106818, + "learning_rate": 4.3340469782167214e-07, + "loss": 0.476, + "step": 11714 + }, + { + "epoch": 0.91, + "grad_norm": 1.2250338705401749, + "learning_rate": 4.3267328798781595e-07, + "loss": 0.5429, + "step": 11715 + }, + { + "epoch": 0.91, + "grad_norm": 1.3735424466012531, + "learning_rate": 4.319424821886553e-07, + "loss": 0.5384, + "step": 11716 + }, + { + "epoch": 0.91, + "grad_norm": 1.1890178817418209, + "learning_rate": 4.3121228047033227e-07, + "loss": 0.4781, + "step": 11717 + }, + { + "epoch": 0.91, + "grad_norm": 1.157881384896909, + "learning_rate": 4.3048268287894435e-07, + "loss": 0.4957, + "step": 11718 + }, + { + "epoch": 0.91, + "grad_norm": 1.2604530056005232, + "learning_rate": 4.2975368946055805e-07, + "loss": 0.4931, + "step": 11719 + }, + { + "epoch": 0.91, + "grad_norm": 1.2466276932215556, + "learning_rate": 4.290253002611966e-07, + "loss": 0.4998, + "step": 11720 + }, + { + "epoch": 0.91, + "grad_norm": 1.1932600763591998, + "learning_rate": 4.282975153268476e-07, + "loss": 0.4837, + "step": 11721 + }, + { + "epoch": 0.91, + "grad_norm": 1.2554693254090885, + "learning_rate": 4.275703347034621e-07, + "loss": 0.558, + "step": 11722 + }, + { + "epoch": 0.91, + "grad_norm": 1.3863700750715937, + "learning_rate": 4.268437584369478e-07, + "loss": 0.5683, + "step": 11723 + }, + { + "epoch": 0.91, + "grad_norm": 1.2086752648904282, + "learning_rate": 4.2611778657317695e-07, + "loss": 0.5212, + "step": 11724 + }, + { + "epoch": 0.91, + "grad_norm": 1.1653620998704188, + "learning_rate": 4.25392419157985e-07, + "loss": 0.4999, + "step": 11725 + }, + { + "epoch": 0.91, + "grad_norm": 1.2040753265022415, + "learning_rate": 4.2466765623716766e-07, + "loss": 0.5015, + "step": 11726 + }, + { + "epoch": 0.91, + "grad_norm": 1.3222846927141374, + "learning_rate": 4.2394349785648494e-07, + "loss": 0.5322, + "step": 11727 + }, + { + "epoch": 0.91, + "grad_norm": 1.2089381293848507, + "learning_rate": 4.232199440616536e-07, + "loss": 0.4804, + "step": 11728 + }, + { + "epoch": 0.91, + "grad_norm": 1.1606385740802105, + "learning_rate": 4.2249699489835815e-07, + "loss": 0.4857, + "step": 11729 + }, + { + "epoch": 0.91, + "grad_norm": 1.3053066589064044, + "learning_rate": 4.2177465041223995e-07, + "loss": 0.5312, + "step": 11730 + }, + { + "epoch": 0.91, + "grad_norm": 1.1614812612052423, + "learning_rate": 4.2105291064890474e-07, + "loss": 0.4595, + "step": 11731 + }, + { + "epoch": 0.91, + "grad_norm": 1.1995305951260657, + "learning_rate": 4.203317756539216e-07, + "loss": 0.5121, + "step": 11732 + }, + { + "epoch": 0.91, + "grad_norm": 1.2239049805179607, + "learning_rate": 4.196112454728185e-07, + "loss": 0.5229, + "step": 11733 + }, + { + "epoch": 0.91, + "grad_norm": 1.2171196915000573, + "learning_rate": 4.1889132015108467e-07, + "loss": 0.5001, + "step": 11734 + }, + { + "epoch": 0.91, + "grad_norm": 1.8535005751743459, + "learning_rate": 4.181719997341738e-07, + "loss": 0.4794, + "step": 11735 + }, + { + "epoch": 0.91, + "grad_norm": 1.2263811791392332, + "learning_rate": 4.1745328426749943e-07, + "loss": 0.5545, + "step": 11736 + }, + { + "epoch": 0.91, + "grad_norm": 1.2140691317618955, + "learning_rate": 4.167351737964409e-07, + "loss": 0.5407, + "step": 11737 + }, + { + "epoch": 0.91, + "grad_norm": 1.247686151899819, + "learning_rate": 4.1601766836633196e-07, + "loss": 0.5128, + "step": 11738 + }, + { + "epoch": 0.91, + "grad_norm": 1.1683219452183686, + "learning_rate": 4.153007680224752e-07, + "loss": 0.5228, + "step": 11739 + }, + { + "epoch": 0.91, + "grad_norm": 1.2837260906515053, + "learning_rate": 4.1458447281013113e-07, + "loss": 0.4502, + "step": 11740 + }, + { + "epoch": 0.91, + "grad_norm": 1.2071021593122753, + "learning_rate": 4.138687827745236e-07, + "loss": 0.4725, + "step": 11741 + }, + { + "epoch": 0.91, + "grad_norm": 1.2537044469710232, + "learning_rate": 4.131536979608386e-07, + "loss": 0.4991, + "step": 11742 + }, + { + "epoch": 0.91, + "grad_norm": 1.193695520486006, + "learning_rate": 4.124392184142223e-07, + "loss": 0.4982, + "step": 11743 + }, + { + "epoch": 0.91, + "grad_norm": 1.1998289959975694, + "learning_rate": 4.1172534417978305e-07, + "loss": 0.5043, + "step": 11744 + }, + { + "epoch": 0.91, + "grad_norm": 1.1520581424895415, + "learning_rate": 4.1101207530259144e-07, + "loss": 0.4442, + "step": 11745 + }, + { + "epoch": 0.91, + "grad_norm": 1.1643199763911125, + "learning_rate": 4.102994118276804e-07, + "loss": 0.5122, + "step": 11746 + }, + { + "epoch": 0.91, + "grad_norm": 1.1934336437833286, + "learning_rate": 4.095873538000439e-07, + "loss": 0.5159, + "step": 11747 + }, + { + "epoch": 0.91, + "grad_norm": 1.253856051857348, + "learning_rate": 4.088759012646382e-07, + "loss": 0.486, + "step": 11748 + }, + { + "epoch": 0.91, + "grad_norm": 1.1820389005485281, + "learning_rate": 4.081650542663795e-07, + "loss": 0.5113, + "step": 11749 + }, + { + "epoch": 0.91, + "grad_norm": 1.1781774797564402, + "learning_rate": 4.0745481285014876e-07, + "loss": 0.483, + "step": 11750 + }, + { + "epoch": 0.91, + "grad_norm": 1.0738288310448638, + "learning_rate": 4.0674517706078556e-07, + "loss": 0.4665, + "step": 11751 + }, + { + "epoch": 0.91, + "grad_norm": 1.2804562971290436, + "learning_rate": 4.060361469430962e-07, + "loss": 0.503, + "step": 11752 + }, + { + "epoch": 0.91, + "grad_norm": 1.2892818379919415, + "learning_rate": 4.0532772254184394e-07, + "loss": 0.5132, + "step": 11753 + }, + { + "epoch": 0.91, + "grad_norm": 1.2004887817252679, + "learning_rate": 4.0461990390175175e-07, + "loss": 0.4391, + "step": 11754 + }, + { + "epoch": 0.91, + "grad_norm": 1.2543431646833745, + "learning_rate": 4.0391269106751174e-07, + "loss": 0.5051, + "step": 11755 + }, + { + "epoch": 0.91, + "grad_norm": 1.2769473799526383, + "learning_rate": 4.032060840837726e-07, + "loss": 0.4994, + "step": 11756 + }, + { + "epoch": 0.91, + "grad_norm": 1.1998899985599114, + "learning_rate": 4.0250008299514755e-07, + "loss": 0.4979, + "step": 11757 + }, + { + "epoch": 0.91, + "grad_norm": 1.2316834766254283, + "learning_rate": 4.017946878462076e-07, + "loss": 0.4457, + "step": 11758 + }, + { + "epoch": 0.91, + "grad_norm": 1.1375048039932254, + "learning_rate": 4.010898986814893e-07, + "loss": 0.4964, + "step": 11759 + }, + { + "epoch": 0.91, + "grad_norm": 1.299131383842934, + "learning_rate": 4.0038571554548934e-07, + "loss": 0.5086, + "step": 11760 + }, + { + "epoch": 0.91, + "grad_norm": 1.1954543584282178, + "learning_rate": 3.9968213848266655e-07, + "loss": 0.4802, + "step": 11761 + }, + { + "epoch": 0.91, + "grad_norm": 1.2054921432865644, + "learning_rate": 3.9897916753744324e-07, + "loss": 0.5338, + "step": 11762 + }, + { + "epoch": 0.91, + "grad_norm": 1.2764553048407306, + "learning_rate": 3.9827680275419944e-07, + "loss": 0.5121, + "step": 11763 + }, + { + "epoch": 0.91, + "grad_norm": 1.2053758942694601, + "learning_rate": 3.9757504417727856e-07, + "loss": 0.4842, + "step": 11764 + }, + { + "epoch": 0.91, + "grad_norm": 1.1320032846854713, + "learning_rate": 3.9687389185098733e-07, + "loss": 0.4689, + "step": 11765 + }, + { + "epoch": 0.91, + "grad_norm": 1.214284639398115, + "learning_rate": 3.9617334581959267e-07, + "loss": 0.5429, + "step": 11766 + }, + { + "epoch": 0.91, + "grad_norm": 1.2946557969823145, + "learning_rate": 3.9547340612732356e-07, + "loss": 0.5099, + "step": 11767 + }, + { + "epoch": 0.91, + "grad_norm": 1.2584743298658398, + "learning_rate": 3.9477407281837246e-07, + "loss": 0.5073, + "step": 11768 + }, + { + "epoch": 0.91, + "grad_norm": 1.1653233810330084, + "learning_rate": 3.940753459368896e-07, + "loss": 0.4227, + "step": 11769 + }, + { + "epoch": 0.91, + "grad_norm": 1.2595806133826835, + "learning_rate": 3.9337722552698963e-07, + "loss": 0.5476, + "step": 11770 + }, + { + "epoch": 0.91, + "grad_norm": 1.2531475968832562, + "learning_rate": 3.9267971163274966e-07, + "loss": 0.5068, + "step": 11771 + }, + { + "epoch": 0.91, + "grad_norm": 1.0947538401381067, + "learning_rate": 3.919828042982077e-07, + "loss": 0.45, + "step": 11772 + }, + { + "epoch": 0.91, + "grad_norm": 1.2633416571296776, + "learning_rate": 3.9128650356736297e-07, + "loss": 0.4809, + "step": 11773 + }, + { + "epoch": 0.91, + "grad_norm": 1.237929333279952, + "learning_rate": 3.905908094841737e-07, + "loss": 0.5068, + "step": 11774 + }, + { + "epoch": 0.91, + "grad_norm": 1.1030452299588724, + "learning_rate": 3.898957220925648e-07, + "loss": 0.4438, + "step": 11775 + }, + { + "epoch": 0.91, + "grad_norm": 1.2417517803556468, + "learning_rate": 3.8920124143642104e-07, + "loss": 0.5079, + "step": 11776 + }, + { + "epoch": 0.91, + "grad_norm": 1.1804708444209424, + "learning_rate": 3.885073675595874e-07, + "loss": 0.4584, + "step": 11777 + }, + { + "epoch": 0.91, + "grad_norm": 1.0939406092630664, + "learning_rate": 3.878141005058733e-07, + "loss": 0.454, + "step": 11778 + }, + { + "epoch": 0.91, + "grad_norm": 1.1600049880348036, + "learning_rate": 3.871214403190471e-07, + "loss": 0.4961, + "step": 11779 + }, + { + "epoch": 0.91, + "grad_norm": 1.2780535297883957, + "learning_rate": 3.864293870428404e-07, + "loss": 0.5328, + "step": 11780 + }, + { + "epoch": 0.91, + "grad_norm": 1.2095696876606805, + "learning_rate": 3.8573794072094495e-07, + "loss": 0.5176, + "step": 11781 + }, + { + "epoch": 0.91, + "grad_norm": 1.2659270785544752, + "learning_rate": 3.8504710139701804e-07, + "loss": 0.5117, + "step": 11782 + }, + { + "epoch": 0.91, + "grad_norm": 1.319515461711487, + "learning_rate": 3.843568691146748e-07, + "loss": 0.5153, + "step": 11783 + }, + { + "epoch": 0.91, + "grad_norm": 1.2002694562539997, + "learning_rate": 3.8366724391749153e-07, + "loss": 0.5164, + "step": 11784 + }, + { + "epoch": 0.91, + "grad_norm": 1.2488768299903783, + "learning_rate": 3.829782258490078e-07, + "loss": 0.4832, + "step": 11785 + }, + { + "epoch": 0.91, + "grad_norm": 1.3546710126636907, + "learning_rate": 3.8228981495272654e-07, + "loss": 0.5357, + "step": 11786 + }, + { + "epoch": 0.91, + "grad_norm": 1.2989807039285735, + "learning_rate": 3.8160201127211083e-07, + "loss": 0.489, + "step": 11787 + }, + { + "epoch": 0.91, + "grad_norm": 1.1660209924008844, + "learning_rate": 3.809148148505848e-07, + "loss": 0.477, + "step": 11788 + }, + { + "epoch": 0.91, + "grad_norm": 1.2577406110418798, + "learning_rate": 3.8022822573153374e-07, + "loss": 0.4625, + "step": 11789 + }, + { + "epoch": 0.91, + "grad_norm": 1.103991866520623, + "learning_rate": 3.795422439583063e-07, + "loss": 0.4281, + "step": 11790 + }, + { + "epoch": 0.91, + "grad_norm": 1.1095122131596897, + "learning_rate": 3.788568695742123e-07, + "loss": 0.4694, + "step": 11791 + }, + { + "epoch": 0.91, + "grad_norm": 1.2214688028356104, + "learning_rate": 3.781721026225227e-07, + "loss": 0.5078, + "step": 11792 + }, + { + "epoch": 0.91, + "grad_norm": 1.2617199310202725, + "learning_rate": 3.7748794314647066e-07, + "loss": 0.5729, + "step": 11793 + }, + { + "epoch": 0.91, + "grad_norm": 1.2734457378472044, + "learning_rate": 3.768043911892505e-07, + "loss": 0.539, + "step": 11794 + }, + { + "epoch": 0.92, + "grad_norm": 1.40469694208657, + "learning_rate": 3.7612144679401664e-07, + "loss": 0.5234, + "step": 11795 + }, + { + "epoch": 0.92, + "grad_norm": 1.1789927679006709, + "learning_rate": 3.754391100038901e-07, + "loss": 0.5356, + "step": 11796 + }, + { + "epoch": 0.92, + "grad_norm": 1.2780748427419666, + "learning_rate": 3.747573808619476e-07, + "loss": 0.5241, + "step": 11797 + }, + { + "epoch": 0.92, + "grad_norm": 1.2062431651499914, + "learning_rate": 3.740762594112324e-07, + "loss": 0.4773, + "step": 11798 + }, + { + "epoch": 0.92, + "grad_norm": 1.195164340941025, + "learning_rate": 3.733957456947457e-07, + "loss": 0.5391, + "step": 11799 + }, + { + "epoch": 0.92, + "grad_norm": 1.1602979888131897, + "learning_rate": 3.72715839755452e-07, + "loss": 0.4759, + "step": 11800 + }, + { + "epoch": 0.92, + "grad_norm": 1.2576929355445683, + "learning_rate": 3.720365416362792e-07, + "loss": 0.5166, + "step": 11801 + }, + { + "epoch": 0.92, + "grad_norm": 1.248847239147885, + "learning_rate": 3.713578513801119e-07, + "loss": 0.5368, + "step": 11802 + }, + { + "epoch": 0.92, + "grad_norm": 1.2689911148145008, + "learning_rate": 3.706797690298014e-07, + "loss": 0.5151, + "step": 11803 + }, + { + "epoch": 0.92, + "grad_norm": 1.1390971049009015, + "learning_rate": 3.7000229462815785e-07, + "loss": 0.4449, + "step": 11804 + }, + { + "epoch": 0.92, + "grad_norm": 1.2716373287148128, + "learning_rate": 3.6932542821795256e-07, + "loss": 0.5376, + "step": 11805 + }, + { + "epoch": 0.92, + "grad_norm": 1.2276580194511737, + "learning_rate": 3.6864916984192143e-07, + "loss": 0.4925, + "step": 11806 + }, + { + "epoch": 0.92, + "grad_norm": 1.201898469819754, + "learning_rate": 3.6797351954275916e-07, + "loss": 0.4607, + "step": 11807 + }, + { + "epoch": 0.92, + "grad_norm": 1.2896428044687338, + "learning_rate": 3.6729847736312387e-07, + "loss": 0.5385, + "step": 11808 + }, + { + "epoch": 0.92, + "grad_norm": 1.2846397127112779, + "learning_rate": 3.6662404334563363e-07, + "loss": 0.5583, + "step": 11809 + }, + { + "epoch": 0.92, + "grad_norm": 1.2214329361339653, + "learning_rate": 3.6595021753286886e-07, + "loss": 0.4906, + "step": 11810 + }, + { + "epoch": 0.92, + "grad_norm": 1.2725794423005357, + "learning_rate": 3.652769999673733e-07, + "loss": 0.5469, + "step": 11811 + }, + { + "epoch": 0.92, + "grad_norm": 1.2037241546137951, + "learning_rate": 3.646043906916474e-07, + "loss": 0.5198, + "step": 11812 + }, + { + "epoch": 0.92, + "grad_norm": 1.360362297284121, + "learning_rate": 3.639323897481606e-07, + "loss": 0.5368, + "step": 11813 + }, + { + "epoch": 0.92, + "grad_norm": 1.2828800717955788, + "learning_rate": 3.632609971793366e-07, + "loss": 0.4928, + "step": 11814 + }, + { + "epoch": 0.92, + "grad_norm": 1.11524998107787, + "learning_rate": 3.6259021302756383e-07, + "loss": 0.4862, + "step": 11815 + }, + { + "epoch": 0.92, + "grad_norm": 1.1956216250350546, + "learning_rate": 3.619200373351939e-07, + "loss": 0.4736, + "step": 11816 + }, + { + "epoch": 0.92, + "grad_norm": 1.2944798240047037, + "learning_rate": 3.612504701445385e-07, + "loss": 0.4855, + "step": 11817 + }, + { + "epoch": 0.92, + "grad_norm": 1.192429400259517, + "learning_rate": 3.6058151149787166e-07, + "loss": 0.5089, + "step": 11818 + }, + { + "epoch": 0.92, + "grad_norm": 1.350053683732373, + "learning_rate": 3.5991316143742515e-07, + "loss": 0.4956, + "step": 11819 + }, + { + "epoch": 0.92, + "grad_norm": 1.2975610504046746, + "learning_rate": 3.592454200053963e-07, + "loss": 0.5721, + "step": 11820 + }, + { + "epoch": 0.92, + "grad_norm": 1.2104513853853138, + "learning_rate": 3.585782872439458e-07, + "loss": 0.5139, + "step": 11821 + }, + { + "epoch": 0.92, + "grad_norm": 1.318205964381494, + "learning_rate": 3.5791176319519006e-07, + "loss": 0.5689, + "step": 11822 + }, + { + "epoch": 0.92, + "grad_norm": 1.205212108020372, + "learning_rate": 3.5724584790121084e-07, + "loss": 0.4791, + "step": 11823 + }, + { + "epoch": 0.92, + "grad_norm": 1.2967040627210051, + "learning_rate": 3.565805414040535e-07, + "loss": 0.5225, + "step": 11824 + }, + { + "epoch": 0.92, + "grad_norm": 1.326910742890564, + "learning_rate": 3.5591584374571773e-07, + "loss": 0.5302, + "step": 11825 + }, + { + "epoch": 0.92, + "grad_norm": 1.1723913962326695, + "learning_rate": 3.5525175496817223e-07, + "loss": 0.4868, + "step": 11826 + }, + { + "epoch": 0.92, + "grad_norm": 1.1615463303330051, + "learning_rate": 3.545882751133445e-07, + "loss": 0.502, + "step": 11827 + }, + { + "epoch": 0.92, + "grad_norm": 1.225542652839979, + "learning_rate": 3.5392540422312213e-07, + "loss": 0.4472, + "step": 11828 + }, + { + "epoch": 0.92, + "grad_norm": 1.1211358570870824, + "learning_rate": 3.5326314233935734e-07, + "loss": 0.5319, + "step": 11829 + }, + { + "epoch": 0.92, + "grad_norm": 1.2231918478092207, + "learning_rate": 3.5260148950385985e-07, + "loss": 0.4812, + "step": 11830 + }, + { + "epoch": 0.92, + "grad_norm": 1.208629944449851, + "learning_rate": 3.5194044575840523e-07, + "loss": 0.4487, + "step": 11831 + }, + { + "epoch": 0.92, + "grad_norm": 1.052311686796946, + "learning_rate": 3.5128001114472674e-07, + "loss": 0.4481, + "step": 11832 + }, + { + "epoch": 0.92, + "grad_norm": 1.2427054710813539, + "learning_rate": 3.506201857045222e-07, + "loss": 0.4825, + "step": 11833 + }, + { + "epoch": 0.92, + "grad_norm": 1.1732240668177534, + "learning_rate": 3.4996096947945036e-07, + "loss": 0.4688, + "step": 11834 + }, + { + "epoch": 0.92, + "grad_norm": 1.25371243413933, + "learning_rate": 3.4930236251112914e-07, + "loss": 0.4958, + "step": 11835 + }, + { + "epoch": 0.92, + "grad_norm": 1.1516007436460405, + "learning_rate": 3.4864436484114086e-07, + "loss": 0.4779, + "step": 11836 + }, + { + "epoch": 0.92, + "grad_norm": 1.2052656179324965, + "learning_rate": 3.4798697651102887e-07, + "loss": 0.5564, + "step": 11837 + }, + { + "epoch": 0.92, + "grad_norm": 1.2081124608099083, + "learning_rate": 3.4733019756229557e-07, + "loss": 0.4667, + "step": 11838 + }, + { + "epoch": 0.92, + "grad_norm": 1.194664025022433, + "learning_rate": 3.4667402803641005e-07, + "loss": 0.4947, + "step": 11839 + }, + { + "epoch": 0.92, + "grad_norm": 1.0570926510274017, + "learning_rate": 3.460184679747969e-07, + "loss": 0.4169, + "step": 11840 + }, + { + "epoch": 0.92, + "grad_norm": 1.119244316050855, + "learning_rate": 3.453635174188463e-07, + "loss": 0.4697, + "step": 11841 + }, + { + "epoch": 0.92, + "grad_norm": 1.2132183129861274, + "learning_rate": 3.447091764099075e-07, + "loss": 0.4992, + "step": 11842 + }, + { + "epoch": 0.92, + "grad_norm": 1.2914518003282607, + "learning_rate": 3.440554449892941e-07, + "loss": 0.5004, + "step": 11843 + }, + { + "epoch": 0.92, + "grad_norm": 1.1518716653329535, + "learning_rate": 3.434023231982786e-07, + "loss": 0.4562, + "step": 11844 + }, + { + "epoch": 0.92, + "grad_norm": 1.2147686312593688, + "learning_rate": 3.4274981107809466e-07, + "loss": 0.4195, + "step": 11845 + }, + { + "epoch": 0.92, + "grad_norm": 1.1839667509926344, + "learning_rate": 3.4209790866994055e-07, + "loss": 0.4858, + "step": 11846 + }, + { + "epoch": 0.92, + "grad_norm": 1.171469554699259, + "learning_rate": 3.414466160149732e-07, + "loss": 0.4887, + "step": 11847 + }, + { + "epoch": 0.92, + "grad_norm": 1.210771660062191, + "learning_rate": 3.4079593315431315e-07, + "loss": 0.4957, + "step": 11848 + }, + { + "epoch": 0.92, + "grad_norm": 1.2522919147351381, + "learning_rate": 3.401458601290408e-07, + "loss": 0.534, + "step": 11849 + }, + { + "epoch": 0.92, + "grad_norm": 1.2936390409939664, + "learning_rate": 3.3949639698019896e-07, + "loss": 0.5641, + "step": 11850 + }, + { + "epoch": 0.92, + "grad_norm": 1.369830731703924, + "learning_rate": 3.388475437487915e-07, + "loss": 0.5514, + "step": 11851 + }, + { + "epoch": 0.92, + "grad_norm": 1.297526230519369, + "learning_rate": 3.381993004757822e-07, + "loss": 0.5662, + "step": 11852 + }, + { + "epoch": 0.92, + "grad_norm": 1.2452877870259023, + "learning_rate": 3.3755166720210065e-07, + "loss": 0.5143, + "step": 11853 + }, + { + "epoch": 0.92, + "grad_norm": 1.1829554785076084, + "learning_rate": 3.3690464396863407e-07, + "loss": 0.5026, + "step": 11854 + }, + { + "epoch": 0.92, + "grad_norm": 1.1929361994711225, + "learning_rate": 3.3625823081623097e-07, + "loss": 0.4908, + "step": 11855 + }, + { + "epoch": 0.92, + "grad_norm": 1.153294213344485, + "learning_rate": 3.3561242778570426e-07, + "loss": 0.5194, + "step": 11856 + }, + { + "epoch": 0.92, + "grad_norm": 1.2180884106123477, + "learning_rate": 3.349672349178279e-07, + "loss": 0.466, + "step": 11857 + }, + { + "epoch": 0.92, + "grad_norm": 1.2845498367447854, + "learning_rate": 3.343226522533338e-07, + "loss": 0.558, + "step": 11858 + }, + { + "epoch": 0.92, + "grad_norm": 1.172581777425249, + "learning_rate": 3.3367867983292056e-07, + "loss": 0.4776, + "step": 11859 + }, + { + "epoch": 0.92, + "grad_norm": 1.239165414622704, + "learning_rate": 3.330353176972423e-07, + "loss": 0.47, + "step": 11860 + }, + { + "epoch": 0.92, + "grad_norm": 1.1473822384082426, + "learning_rate": 3.323925658869209e-07, + "loss": 0.4775, + "step": 11861 + }, + { + "epoch": 0.92, + "grad_norm": 1.1340337024437142, + "learning_rate": 3.3175042444253405e-07, + "loss": 0.4497, + "step": 11862 + }, + { + "epoch": 0.92, + "grad_norm": 1.2249561652300989, + "learning_rate": 3.311088934046247e-07, + "loss": 0.5151, + "step": 11863 + }, + { + "epoch": 0.92, + "grad_norm": 1.1540089617700202, + "learning_rate": 3.3046797281369614e-07, + "loss": 0.4877, + "step": 11864 + }, + { + "epoch": 0.92, + "grad_norm": 1.2289523999375467, + "learning_rate": 3.298276627102126e-07, + "loss": 0.4859, + "step": 11865 + }, + { + "epoch": 0.92, + "grad_norm": 1.187750940411227, + "learning_rate": 3.2918796313459954e-07, + "loss": 0.512, + "step": 11866 + }, + { + "epoch": 0.92, + "grad_norm": 1.19838081434691, + "learning_rate": 3.285488741272458e-07, + "loss": 0.4784, + "step": 11867 + }, + { + "epoch": 0.92, + "grad_norm": 1.2647895881502962, + "learning_rate": 3.2791039572849903e-07, + "loss": 0.4906, + "step": 11868 + }, + { + "epoch": 0.92, + "grad_norm": 1.2120988260806527, + "learning_rate": 3.272725279786715e-07, + "loss": 0.5222, + "step": 11869 + }, + { + "epoch": 0.92, + "grad_norm": 1.1655362931520417, + "learning_rate": 3.2663527091803317e-07, + "loss": 0.4923, + "step": 11870 + }, + { + "epoch": 0.92, + "grad_norm": 1.300798891423282, + "learning_rate": 3.2599862458681963e-07, + "loss": 0.5189, + "step": 11871 + }, + { + "epoch": 0.92, + "grad_norm": 1.2424211582214633, + "learning_rate": 3.2536258902522323e-07, + "loss": 0.507, + "step": 11872 + }, + { + "epoch": 0.92, + "grad_norm": 1.2824541341748668, + "learning_rate": 3.247271642734007e-07, + "loss": 0.4833, + "step": 11873 + }, + { + "epoch": 0.92, + "grad_norm": 1.2379879769648892, + "learning_rate": 3.24092350371471e-07, + "loss": 0.5265, + "step": 11874 + }, + { + "epoch": 0.92, + "grad_norm": 1.242445577047439, + "learning_rate": 3.234581473595122e-07, + "loss": 0.4934, + "step": 11875 + }, + { + "epoch": 0.92, + "grad_norm": 1.2357918056645432, + "learning_rate": 3.228245552775633e-07, + "loss": 0.4952, + "step": 11876 + }, + { + "epoch": 0.92, + "grad_norm": 1.1593829316641642, + "learning_rate": 3.22191574165629e-07, + "loss": 0.5036, + "step": 11877 + }, + { + "epoch": 0.92, + "grad_norm": 1.242463950818148, + "learning_rate": 3.215592040636717e-07, + "loss": 0.5641, + "step": 11878 + }, + { + "epoch": 0.92, + "grad_norm": 1.2007545304412486, + "learning_rate": 3.209274450116162e-07, + "loss": 0.5184, + "step": 11879 + }, + { + "epoch": 0.92, + "grad_norm": 1.2131656451641326, + "learning_rate": 3.202962970493484e-07, + "loss": 0.5185, + "step": 11880 + }, + { + "epoch": 0.92, + "grad_norm": 1.2582918284478506, + "learning_rate": 3.196657602167175e-07, + "loss": 0.4557, + "step": 11881 + }, + { + "epoch": 0.92, + "grad_norm": 1.2227858209660794, + "learning_rate": 3.1903583455352937e-07, + "loss": 0.5374, + "step": 11882 + }, + { + "epoch": 0.92, + "grad_norm": 1.168920531046059, + "learning_rate": 3.1840652009955563e-07, + "loss": 0.4883, + "step": 11883 + }, + { + "epoch": 0.92, + "grad_norm": 1.3303419571299668, + "learning_rate": 3.1777781689453e-07, + "loss": 0.5477, + "step": 11884 + }, + { + "epoch": 0.92, + "grad_norm": 1.2804068140175722, + "learning_rate": 3.1714972497814413e-07, + "loss": 0.5048, + "step": 11885 + }, + { + "epoch": 0.92, + "grad_norm": 1.1467945517615132, + "learning_rate": 3.1652224439005287e-07, + "loss": 0.4773, + "step": 11886 + }, + { + "epoch": 0.92, + "grad_norm": 1.174258846529829, + "learning_rate": 3.158953751698723e-07, + "loss": 0.4696, + "step": 11887 + }, + { + "epoch": 0.92, + "grad_norm": 1.260874510069792, + "learning_rate": 3.152691173571809e-07, + "loss": 0.5119, + "step": 11888 + }, + { + "epoch": 0.92, + "grad_norm": 1.139834299760827, + "learning_rate": 3.146434709915158e-07, + "loss": 0.481, + "step": 11889 + }, + { + "epoch": 0.92, + "grad_norm": 1.1358686591569918, + "learning_rate": 3.1401843611237993e-07, + "loss": 0.477, + "step": 11890 + }, + { + "epoch": 0.92, + "grad_norm": 1.268411042359718, + "learning_rate": 3.1339401275923277e-07, + "loss": 0.4924, + "step": 11891 + }, + { + "epoch": 0.92, + "grad_norm": 1.2053667461678765, + "learning_rate": 3.127702009714961e-07, + "loss": 0.49, + "step": 11892 + }, + { + "epoch": 0.92, + "grad_norm": 1.159756524168175, + "learning_rate": 3.121470007885574e-07, + "loss": 0.4644, + "step": 11893 + }, + { + "epoch": 0.92, + "grad_norm": 1.2298578108035971, + "learning_rate": 3.1152441224976073e-07, + "loss": 0.4833, + "step": 11894 + }, + { + "epoch": 0.92, + "grad_norm": 1.2799199807218755, + "learning_rate": 3.1090243539441565e-07, + "loss": 0.5611, + "step": 11895 + }, + { + "epoch": 0.92, + "grad_norm": 1.31700301863861, + "learning_rate": 3.1028107026178756e-07, + "loss": 0.5299, + "step": 11896 + }, + { + "epoch": 0.92, + "grad_norm": 1.1041180881874268, + "learning_rate": 3.096603168911072e-07, + "loss": 0.4529, + "step": 11897 + }, + { + "epoch": 0.92, + "grad_norm": 1.2636458852651218, + "learning_rate": 3.090401753215677e-07, + "loss": 0.5014, + "step": 11898 + }, + { + "epoch": 0.92, + "grad_norm": 1.3141831096587264, + "learning_rate": 3.084206455923211e-07, + "loss": 0.5246, + "step": 11899 + }, + { + "epoch": 0.92, + "grad_norm": 1.1742321467951986, + "learning_rate": 3.078017277424805e-07, + "loss": 0.5124, + "step": 11900 + }, + { + "epoch": 0.92, + "grad_norm": 1.2465338811603952, + "learning_rate": 3.071834218111225e-07, + "loss": 0.5203, + "step": 11901 + }, + { + "epoch": 0.92, + "grad_norm": 1.2592303891134542, + "learning_rate": 3.0656572783728247e-07, + "loss": 0.5941, + "step": 11902 + }, + { + "epoch": 0.92, + "grad_norm": 1.1550183308785742, + "learning_rate": 3.059486458599592e-07, + "loss": 0.4972, + "step": 11903 + }, + { + "epoch": 0.92, + "grad_norm": 1.3015029325741627, + "learning_rate": 3.0533217591811383e-07, + "loss": 0.5139, + "step": 11904 + }, + { + "epoch": 0.92, + "grad_norm": 1.267953261717419, + "learning_rate": 3.0471631805066626e-07, + "loss": 0.4956, + "step": 11905 + }, + { + "epoch": 0.92, + "grad_norm": 1.265810729162535, + "learning_rate": 3.0410107229649764e-07, + "loss": 0.4983, + "step": 11906 + }, + { + "epoch": 0.92, + "grad_norm": 1.2888394711921056, + "learning_rate": 3.034864386944525e-07, + "loss": 0.5086, + "step": 11907 + }, + { + "epoch": 0.92, + "grad_norm": 1.255999425313903, + "learning_rate": 3.028724172833364e-07, + "loss": 0.5289, + "step": 11908 + }, + { + "epoch": 0.92, + "grad_norm": 1.2235600844900756, + "learning_rate": 3.022590081019161e-07, + "loss": 0.5266, + "step": 11909 + }, + { + "epoch": 0.92, + "grad_norm": 1.3734475388230556, + "learning_rate": 3.0164621118891733e-07, + "loss": 0.4774, + "step": 11910 + }, + { + "epoch": 0.92, + "grad_norm": 1.1895065420900075, + "learning_rate": 3.010340265830314e-07, + "loss": 0.5026, + "step": 11911 + }, + { + "epoch": 0.92, + "grad_norm": 1.233633180631941, + "learning_rate": 3.0042245432290616e-07, + "loss": 0.5101, + "step": 11912 + }, + { + "epoch": 0.92, + "grad_norm": 1.2901873680188227, + "learning_rate": 2.998114944471542e-07, + "loss": 0.5005, + "step": 11913 + }, + { + "epoch": 0.92, + "grad_norm": 1.232769948710911, + "learning_rate": 2.992011469943501e-07, + "loss": 0.518, + "step": 11914 + }, + { + "epoch": 0.92, + "grad_norm": 1.1740572637502646, + "learning_rate": 2.985914120030275e-07, + "loss": 0.4807, + "step": 11915 + }, + { + "epoch": 0.92, + "grad_norm": 1.1624653247050696, + "learning_rate": 2.979822895116802e-07, + "loss": 0.465, + "step": 11916 + }, + { + "epoch": 0.92, + "grad_norm": 1.1991851503716289, + "learning_rate": 2.973737795587672e-07, + "loss": 0.4782, + "step": 11917 + }, + { + "epoch": 0.92, + "grad_norm": 1.0942294568544784, + "learning_rate": 2.967658821827069e-07, + "loss": 0.4484, + "step": 11918 + }, + { + "epoch": 0.92, + "grad_norm": 1.2111436237750206, + "learning_rate": 2.9615859742187944e-07, + "loss": 0.5323, + "step": 11919 + }, + { + "epoch": 0.92, + "grad_norm": 1.4270931531285191, + "learning_rate": 2.955519253146233e-07, + "loss": 0.5519, + "step": 11920 + }, + { + "epoch": 0.92, + "grad_norm": 1.215772610996343, + "learning_rate": 2.9494586589924434e-07, + "loss": 0.5311, + "step": 11921 + }, + { + "epoch": 0.92, + "grad_norm": 1.2044263960002966, + "learning_rate": 2.9434041921400204e-07, + "loss": 0.5183, + "step": 11922 + }, + { + "epoch": 0.92, + "grad_norm": 1.1510258784297343, + "learning_rate": 2.9373558529712466e-07, + "loss": 0.4968, + "step": 11923 + }, + { + "epoch": 0.93, + "grad_norm": 1.268845921916506, + "learning_rate": 2.9313136418679835e-07, + "loss": 0.4591, + "step": 11924 + }, + { + "epoch": 0.93, + "grad_norm": 1.1582245848856378, + "learning_rate": 2.9252775592116924e-07, + "loss": 0.4784, + "step": 11925 + }, + { + "epoch": 0.93, + "grad_norm": 1.2377338342629929, + "learning_rate": 2.919247605383468e-07, + "loss": 0.5133, + "step": 11926 + }, + { + "epoch": 0.93, + "grad_norm": 1.3280985212491447, + "learning_rate": 2.913223780764007e-07, + "loss": 0.5569, + "step": 11927 + }, + { + "epoch": 0.93, + "grad_norm": 1.0928988278254506, + "learning_rate": 2.9072060857336384e-07, + "loss": 0.4218, + "step": 11928 + }, + { + "epoch": 0.93, + "grad_norm": 1.110030558139715, + "learning_rate": 2.9011945206722904e-07, + "loss": 0.4884, + "step": 11929 + }, + { + "epoch": 0.93, + "grad_norm": 1.134434347990986, + "learning_rate": 2.895189085959482e-07, + "loss": 0.4737, + "step": 11930 + }, + { + "epoch": 0.93, + "grad_norm": 1.1509882308961452, + "learning_rate": 2.8891897819743996e-07, + "loss": 0.4277, + "step": 11931 + }, + { + "epoch": 0.93, + "grad_norm": 1.2279753596259868, + "learning_rate": 2.883196609095773e-07, + "loss": 0.5068, + "step": 11932 + }, + { + "epoch": 0.93, + "grad_norm": 1.1836579056779666, + "learning_rate": 2.877209567702011e-07, + "loss": 0.5076, + "step": 11933 + }, + { + "epoch": 0.93, + "grad_norm": 1.2130509668289684, + "learning_rate": 2.871228658171088e-07, + "loss": 0.4813, + "step": 11934 + }, + { + "epoch": 0.93, + "grad_norm": 1.2656736482289208, + "learning_rate": 2.8652538808806253e-07, + "loss": 0.5475, + "step": 11935 + }, + { + "epoch": 0.93, + "grad_norm": 1.1824683954963262, + "learning_rate": 2.8592852362078315e-07, + "loss": 0.5113, + "step": 11936 + }, + { + "epoch": 0.93, + "grad_norm": 1.1837851995856066, + "learning_rate": 2.85332272452955e-07, + "loss": 0.484, + "step": 11937 + }, + { + "epoch": 0.93, + "grad_norm": 1.1781996382082538, + "learning_rate": 2.8473663462222025e-07, + "loss": 0.5109, + "step": 11938 + }, + { + "epoch": 0.93, + "grad_norm": 1.161489369386853, + "learning_rate": 2.8414161016618757e-07, + "loss": 0.474, + "step": 11939 + }, + { + "epoch": 0.93, + "grad_norm": 1.19865061072465, + "learning_rate": 2.8354719912242037e-07, + "loss": 0.4294, + "step": 11940 + }, + { + "epoch": 0.93, + "grad_norm": 1.2778090349926128, + "learning_rate": 2.8295340152845076e-07, + "loss": 0.5432, + "step": 11941 + }, + { + "epoch": 0.93, + "grad_norm": 1.2800668510473698, + "learning_rate": 2.823602174217643e-07, + "loss": 0.5377, + "step": 11942 + }, + { + "epoch": 0.93, + "grad_norm": 1.340754519166004, + "learning_rate": 2.817676468398145e-07, + "loss": 0.5583, + "step": 11943 + }, + { + "epoch": 0.93, + "grad_norm": 1.2766522043336932, + "learning_rate": 2.811756898200124e-07, + "loss": 0.5518, + "step": 11944 + }, + { + "epoch": 0.93, + "grad_norm": 1.1960889002666162, + "learning_rate": 2.8058434639973155e-07, + "loss": 0.4657, + "step": 11945 + }, + { + "epoch": 0.93, + "grad_norm": 1.2276840427821718, + "learning_rate": 2.799936166163075e-07, + "loss": 0.4676, + "step": 11946 + }, + { + "epoch": 0.93, + "grad_norm": 1.1255634804053662, + "learning_rate": 2.79403500507035e-07, + "loss": 0.4804, + "step": 11947 + }, + { + "epoch": 0.93, + "grad_norm": 1.1909780264299954, + "learning_rate": 2.788139981091698e-07, + "loss": 0.4561, + "step": 11948 + }, + { + "epoch": 0.93, + "grad_norm": 1.1905603626557855, + "learning_rate": 2.782251094599331e-07, + "loss": 0.4857, + "step": 11949 + }, + { + "epoch": 0.93, + "grad_norm": 1.1718303926243183, + "learning_rate": 2.7763683459650193e-07, + "loss": 0.5005, + "step": 11950 + }, + { + "epoch": 0.93, + "grad_norm": 1.2764484872879782, + "learning_rate": 2.770491735560199e-07, + "loss": 0.4717, + "step": 11951 + }, + { + "epoch": 0.93, + "grad_norm": 1.1982047798765632, + "learning_rate": 2.764621263755862e-07, + "loss": 0.5143, + "step": 11952 + }, + { + "epoch": 0.93, + "grad_norm": 1.194503211022297, + "learning_rate": 2.758756930922646e-07, + "loss": 0.4927, + "step": 11953 + }, + { + "epoch": 0.93, + "grad_norm": 1.1853288578882966, + "learning_rate": 2.752898737430809e-07, + "loss": 0.5167, + "step": 11954 + }, + { + "epoch": 0.93, + "grad_norm": 1.1788184394264087, + "learning_rate": 2.747046683650201e-07, + "loss": 0.4516, + "step": 11955 + }, + { + "epoch": 0.93, + "grad_norm": 1.307880856494758, + "learning_rate": 2.741200769950303e-07, + "loss": 0.525, + "step": 11956 + }, + { + "epoch": 0.93, + "grad_norm": 1.3012016922836251, + "learning_rate": 2.735360996700187e-07, + "loss": 0.5012, + "step": 11957 + }, + { + "epoch": 0.93, + "grad_norm": 1.178059952050663, + "learning_rate": 2.7295273642685473e-07, + "loss": 0.5073, + "step": 11958 + }, + { + "epoch": 0.93, + "grad_norm": 1.2309126764618947, + "learning_rate": 2.7236998730236996e-07, + "loss": 0.5447, + "step": 11959 + }, + { + "epoch": 0.93, + "grad_norm": 1.1406729178618724, + "learning_rate": 2.71787852333355e-07, + "loss": 0.4653, + "step": 11960 + }, + { + "epoch": 0.93, + "grad_norm": 1.1757647744478734, + "learning_rate": 2.7120633155656606e-07, + "loss": 0.4984, + "step": 11961 + }, + { + "epoch": 0.93, + "grad_norm": 1.185925845766163, + "learning_rate": 2.706254250087126e-07, + "loss": 0.4842, + "step": 11962 + }, + { + "epoch": 0.93, + "grad_norm": 1.2629092714518357, + "learning_rate": 2.7004513272647415e-07, + "loss": 0.4513, + "step": 11963 + }, + { + "epoch": 0.93, + "grad_norm": 1.3132943973643378, + "learning_rate": 2.69465454746487e-07, + "loss": 0.5086, + "step": 11964 + }, + { + "epoch": 0.93, + "grad_norm": 1.3004609134524585, + "learning_rate": 2.688863911053474e-07, + "loss": 0.5084, + "step": 11965 + }, + { + "epoch": 0.93, + "grad_norm": 1.2063437667533226, + "learning_rate": 2.683079418396173e-07, + "loss": 0.5034, + "step": 11966 + }, + { + "epoch": 0.93, + "grad_norm": 1.208598727097031, + "learning_rate": 2.6773010698581516e-07, + "loss": 0.5736, + "step": 11967 + }, + { + "epoch": 0.93, + "grad_norm": 1.1611731083505412, + "learning_rate": 2.671528865804229e-07, + "loss": 0.4801, + "step": 11968 + }, + { + "epoch": 0.93, + "grad_norm": 1.170193903252669, + "learning_rate": 2.6657628065988483e-07, + "loss": 0.4723, + "step": 11969 + }, + { + "epoch": 0.93, + "grad_norm": 1.087732930285538, + "learning_rate": 2.6600028926060283e-07, + "loss": 0.4972, + "step": 11970 + }, + { + "epoch": 0.93, + "grad_norm": 1.1423176786494544, + "learning_rate": 2.6542491241894454e-07, + "loss": 0.4859, + "step": 11971 + }, + { + "epoch": 0.93, + "grad_norm": 1.155597451079196, + "learning_rate": 2.648501501712342e-07, + "loss": 0.506, + "step": 11972 + }, + { + "epoch": 0.93, + "grad_norm": 1.2153282058183568, + "learning_rate": 2.6427600255376164e-07, + "loss": 0.4823, + "step": 11973 + }, + { + "epoch": 0.93, + "grad_norm": 1.1563043839320493, + "learning_rate": 2.6370246960277344e-07, + "loss": 0.4881, + "step": 11974 + }, + { + "epoch": 0.93, + "grad_norm": 1.167189117798201, + "learning_rate": 2.631295513544818e-07, + "loss": 0.4691, + "step": 11975 + }, + { + "epoch": 0.93, + "grad_norm": 1.1438665445286416, + "learning_rate": 2.625572478450578e-07, + "loss": 0.4951, + "step": 11976 + }, + { + "epoch": 0.93, + "grad_norm": 1.3229689650323093, + "learning_rate": 2.619855591106324e-07, + "loss": 0.546, + "step": 11977 + }, + { + "epoch": 0.93, + "grad_norm": 1.215251694656694, + "learning_rate": 2.614144851873002e-07, + "loss": 0.4954, + "step": 11978 + }, + { + "epoch": 0.93, + "grad_norm": 1.274239550806064, + "learning_rate": 2.608440261111178e-07, + "loss": 0.5024, + "step": 11979 + }, + { + "epoch": 0.93, + "grad_norm": 1.273374076153158, + "learning_rate": 2.602741819180976e-07, + "loss": 0.5202, + "step": 11980 + }, + { + "epoch": 0.93, + "grad_norm": 1.1657967673550882, + "learning_rate": 2.597049526442197e-07, + "loss": 0.5017, + "step": 11981 + }, + { + "epoch": 0.93, + "grad_norm": 1.2373461637342138, + "learning_rate": 2.5913633832542083e-07, + "loss": 0.5082, + "step": 11982 + }, + { + "epoch": 0.93, + "grad_norm": 1.1556111710310504, + "learning_rate": 2.5856833899760123e-07, + "loss": 0.4943, + "step": 11983 + }, + { + "epoch": 0.93, + "grad_norm": 1.176614307818194, + "learning_rate": 2.580009546966211e-07, + "loss": 0.4251, + "step": 11984 + }, + { + "epoch": 0.93, + "grad_norm": 1.2693501504694302, + "learning_rate": 2.574341854583029e-07, + "loss": 0.5186, + "step": 11985 + }, + { + "epoch": 0.93, + "grad_norm": 1.1877954015226257, + "learning_rate": 2.568680313184302e-07, + "loss": 0.5017, + "step": 11986 + }, + { + "epoch": 0.93, + "grad_norm": 1.2849889025969692, + "learning_rate": 2.563024923127477e-07, + "loss": 0.5308, + "step": 11987 + }, + { + "epoch": 0.93, + "grad_norm": 1.23379657568553, + "learning_rate": 2.5573756847695806e-07, + "loss": 0.506, + "step": 11988 + }, + { + "epoch": 0.93, + "grad_norm": 1.1914391685065533, + "learning_rate": 2.551732598467305e-07, + "loss": 0.5038, + "step": 11989 + }, + { + "epoch": 0.93, + "grad_norm": 1.1735511998993036, + "learning_rate": 2.5460956645769085e-07, + "loss": 0.5073, + "step": 11990 + }, + { + "epoch": 0.93, + "grad_norm": 1.0828736809514763, + "learning_rate": 2.5404648834542855e-07, + "loss": 0.4625, + "step": 11991 + }, + { + "epoch": 0.93, + "grad_norm": 1.2085040837804049, + "learning_rate": 2.534840255454962e-07, + "loss": 0.5056, + "step": 11992 + }, + { + "epoch": 0.93, + "grad_norm": 1.2284209634914356, + "learning_rate": 2.529221780933999e-07, + "loss": 0.497, + "step": 11993 + }, + { + "epoch": 0.93, + "grad_norm": 1.234403972044294, + "learning_rate": 2.523609460246168e-07, + "loss": 0.518, + "step": 11994 + }, + { + "epoch": 0.93, + "grad_norm": 1.0907697674439658, + "learning_rate": 2.5180032937457744e-07, + "loss": 0.4483, + "step": 11995 + }, + { + "epoch": 0.93, + "grad_norm": 1.2134257685555707, + "learning_rate": 2.512403281786768e-07, + "loss": 0.4758, + "step": 11996 + }, + { + "epoch": 0.93, + "grad_norm": 1.1679209313443497, + "learning_rate": 2.5068094247227227e-07, + "loss": 0.4917, + "step": 11997 + }, + { + "epoch": 0.93, + "grad_norm": 1.2417213956987827, + "learning_rate": 2.501221722906799e-07, + "loss": 0.5062, + "step": 11998 + }, + { + "epoch": 0.93, + "grad_norm": 1.1856591864339103, + "learning_rate": 2.4956401766917713e-07, + "loss": 0.4469, + "step": 11999 + }, + { + "epoch": 0.93, + "grad_norm": 1.2145081574789132, + "learning_rate": 2.4900647864300353e-07, + "loss": 0.5042, + "step": 12000 + }, + { + "epoch": 0.93, + "grad_norm": 1.190122719695915, + "learning_rate": 2.4844955524735983e-07, + "loss": 0.5048, + "step": 12001 + }, + { + "epoch": 0.93, + "grad_norm": 1.1787707575146524, + "learning_rate": 2.4789324751740674e-07, + "loss": 0.4698, + "step": 12002 + }, + { + "epoch": 0.93, + "grad_norm": 1.2061319303294689, + "learning_rate": 2.4733755548826734e-07, + "loss": 0.4878, + "step": 12003 + }, + { + "epoch": 0.93, + "grad_norm": 1.2186741927608036, + "learning_rate": 2.467824791950246e-07, + "loss": 0.4798, + "step": 12004 + }, + { + "epoch": 0.93, + "grad_norm": 1.2429860745031263, + "learning_rate": 2.4622801867272395e-07, + "loss": 0.5458, + "step": 12005 + }, + { + "epoch": 0.93, + "grad_norm": 1.21187530627746, + "learning_rate": 2.456741739563717e-07, + "loss": 0.4979, + "step": 12006 + }, + { + "epoch": 0.93, + "grad_norm": 1.2818807584502108, + "learning_rate": 2.4512094508093553e-07, + "loss": 0.512, + "step": 12007 + }, + { + "epoch": 0.93, + "grad_norm": 1.3008086513806378, + "learning_rate": 2.445683320813408e-07, + "loss": 0.505, + "step": 12008 + }, + { + "epoch": 0.93, + "grad_norm": 1.165823097883934, + "learning_rate": 2.4401633499248065e-07, + "loss": 0.5171, + "step": 12009 + }, + { + "epoch": 0.93, + "grad_norm": 1.2662404765517519, + "learning_rate": 2.434649538492018e-07, + "loss": 0.5399, + "step": 12010 + }, + { + "epoch": 0.93, + "grad_norm": 1.1070861845849567, + "learning_rate": 2.4291418868631845e-07, + "loss": 0.4199, + "step": 12011 + }, + { + "epoch": 0.93, + "grad_norm": 1.074086573011623, + "learning_rate": 2.423640395386018e-07, + "loss": 0.4101, + "step": 12012 + }, + { + "epoch": 0.93, + "grad_norm": 1.3085612278782843, + "learning_rate": 2.418145064407862e-07, + "loss": 0.5049, + "step": 12013 + }, + { + "epoch": 0.93, + "grad_norm": 1.2489764791553917, + "learning_rate": 2.4126558942756617e-07, + "loss": 0.5287, + "step": 12014 + }, + { + "epoch": 0.93, + "grad_norm": 1.248631347960901, + "learning_rate": 2.407172885335984e-07, + "loss": 0.479, + "step": 12015 + }, + { + "epoch": 0.93, + "grad_norm": 1.241694466495158, + "learning_rate": 2.401696037934997e-07, + "loss": 0.5094, + "step": 12016 + }, + { + "epoch": 0.93, + "grad_norm": 1.2000223257054292, + "learning_rate": 2.39622535241848e-07, + "loss": 0.4584, + "step": 12017 + }, + { + "epoch": 0.93, + "grad_norm": 1.4087392180740177, + "learning_rate": 2.3907608291318217e-07, + "loss": 0.5408, + "step": 12018 + }, + { + "epoch": 0.93, + "grad_norm": 1.0935510181897552, + "learning_rate": 2.3853024684200363e-07, + "loss": 0.45, + "step": 12019 + }, + { + "epoch": 0.93, + "grad_norm": 1.2200969686431282, + "learning_rate": 2.379850270627726e-07, + "loss": 0.5164, + "step": 12020 + }, + { + "epoch": 0.93, + "grad_norm": 1.1740251778789763, + "learning_rate": 2.3744042360991149e-07, + "loss": 0.4775, + "step": 12021 + }, + { + "epoch": 0.93, + "grad_norm": 1.1981639883177975, + "learning_rate": 2.3689643651780614e-07, + "loss": 0.4422, + "step": 12022 + }, + { + "epoch": 0.93, + "grad_norm": 1.311206952707571, + "learning_rate": 2.363530658207991e-07, + "loss": 0.5161, + "step": 12023 + }, + { + "epoch": 0.93, + "grad_norm": 1.3359648049224406, + "learning_rate": 2.3581031155319622e-07, + "loss": 0.4915, + "step": 12024 + }, + { + "epoch": 0.93, + "grad_norm": 1.2361912945423417, + "learning_rate": 2.3526817374926457e-07, + "loss": 0.5389, + "step": 12025 + }, + { + "epoch": 0.93, + "grad_norm": 1.2609622447104212, + "learning_rate": 2.3472665244323346e-07, + "loss": 0.5958, + "step": 12026 + }, + { + "epoch": 0.93, + "grad_norm": 1.2986885167861468, + "learning_rate": 2.3418574766929215e-07, + "loss": 0.4672, + "step": 12027 + }, + { + "epoch": 0.93, + "grad_norm": 1.194463740175709, + "learning_rate": 2.336454594615878e-07, + "loss": 0.5082, + "step": 12028 + }, + { + "epoch": 0.93, + "grad_norm": 1.2091560303535505, + "learning_rate": 2.331057878542342e-07, + "loss": 0.481, + "step": 12029 + }, + { + "epoch": 0.93, + "grad_norm": 1.3155809753010863, + "learning_rate": 2.3256673288130194e-07, + "loss": 0.5244, + "step": 12030 + }, + { + "epoch": 0.93, + "grad_norm": 1.2707554006392356, + "learning_rate": 2.3202829457682597e-07, + "loss": 0.5044, + "step": 12031 + }, + { + "epoch": 0.93, + "grad_norm": 1.2441987842997217, + "learning_rate": 2.314904729748002e-07, + "loss": 0.5169, + "step": 12032 + }, + { + "epoch": 0.93, + "grad_norm": 1.3247685374215392, + "learning_rate": 2.309532681091786e-07, + "loss": 0.5618, + "step": 12033 + }, + { + "epoch": 0.93, + "grad_norm": 1.1598597703228415, + "learning_rate": 2.3041668001387852e-07, + "loss": 0.4502, + "step": 12034 + }, + { + "epoch": 0.93, + "grad_norm": 1.146899328571442, + "learning_rate": 2.2988070872277834e-07, + "loss": 0.4714, + "step": 12035 + }, + { + "epoch": 0.93, + "grad_norm": 1.446801980073105, + "learning_rate": 2.293453542697166e-07, + "loss": 0.5608, + "step": 12036 + }, + { + "epoch": 0.93, + "grad_norm": 1.3620473092162897, + "learning_rate": 2.2881061668849292e-07, + "loss": 0.4841, + "step": 12037 + }, + { + "epoch": 0.93, + "grad_norm": 1.2709417401128855, + "learning_rate": 2.2827649601286693e-07, + "loss": 0.559, + "step": 12038 + }, + { + "epoch": 0.93, + "grad_norm": 1.1602017685922585, + "learning_rate": 2.2774299227656282e-07, + "loss": 0.4496, + "step": 12039 + }, + { + "epoch": 0.93, + "grad_norm": 1.0776044169235615, + "learning_rate": 2.272101055132603e-07, + "loss": 0.4796, + "step": 12040 + }, + { + "epoch": 0.93, + "grad_norm": 1.2741059029367194, + "learning_rate": 2.2667783575660463e-07, + "loss": 0.5094, + "step": 12041 + }, + { + "epoch": 0.93, + "grad_norm": 1.2534939096863569, + "learning_rate": 2.2614618304020118e-07, + "loss": 0.5361, + "step": 12042 + }, + { + "epoch": 0.93, + "grad_norm": 1.1791482160133449, + "learning_rate": 2.2561514739761649e-07, + "loss": 0.4875, + "step": 12043 + }, + { + "epoch": 0.93, + "grad_norm": 1.2272947033285686, + "learning_rate": 2.2508472886237586e-07, + "loss": 0.5262, + "step": 12044 + }, + { + "epoch": 0.93, + "grad_norm": 1.1974734052918359, + "learning_rate": 2.2455492746796814e-07, + "loss": 0.4956, + "step": 12045 + }, + { + "epoch": 0.93, + "grad_norm": 1.1009479729454479, + "learning_rate": 2.240257432478421e-07, + "loss": 0.47, + "step": 12046 + }, + { + "epoch": 0.93, + "grad_norm": 1.2522820146351907, + "learning_rate": 2.234971762354099e-07, + "loss": 0.5039, + "step": 12047 + }, + { + "epoch": 0.93, + "grad_norm": 1.3355769959294632, + "learning_rate": 2.229692264640404e-07, + "loss": 0.5158, + "step": 12048 + }, + { + "epoch": 0.93, + "grad_norm": 1.1656456237277526, + "learning_rate": 2.2244189396706695e-07, + "loss": 0.5079, + "step": 12049 + }, + { + "epoch": 0.93, + "grad_norm": 1.2153635170376567, + "learning_rate": 2.219151787777807e-07, + "loss": 0.5088, + "step": 12050 + }, + { + "epoch": 0.93, + "grad_norm": 1.2055785687228227, + "learning_rate": 2.2138908092943834e-07, + "loss": 0.5088, + "step": 12051 + }, + { + "epoch": 0.93, + "grad_norm": 1.1917718545243419, + "learning_rate": 2.2086360045525444e-07, + "loss": 0.4959, + "step": 12052 + }, + { + "epoch": 0.94, + "grad_norm": 1.3766810804509986, + "learning_rate": 2.2033873738840584e-07, + "loss": 0.4894, + "step": 12053 + }, + { + "epoch": 0.94, + "grad_norm": 1.1321325431876532, + "learning_rate": 2.1981449176202818e-07, + "loss": 0.4704, + "step": 12054 + }, + { + "epoch": 0.94, + "grad_norm": 1.1921969432131843, + "learning_rate": 2.192908636092206e-07, + "loss": 0.4869, + "step": 12055 + }, + { + "epoch": 0.94, + "grad_norm": 1.2414569745944832, + "learning_rate": 2.1876785296304214e-07, + "loss": 0.5467, + "step": 12056 + }, + { + "epoch": 0.94, + "grad_norm": 1.124624825544048, + "learning_rate": 2.1824545985651535e-07, + "loss": 0.4943, + "step": 12057 + }, + { + "epoch": 0.94, + "grad_norm": 1.2874571080146386, + "learning_rate": 2.1772368432261935e-07, + "loss": 0.4755, + "step": 12058 + }, + { + "epoch": 0.94, + "grad_norm": 1.155986188743347, + "learning_rate": 2.1720252639429674e-07, + "loss": 0.4736, + "step": 12059 + }, + { + "epoch": 0.94, + "grad_norm": 1.2450519858847429, + "learning_rate": 2.1668198610445114e-07, + "loss": 0.4918, + "step": 12060 + }, + { + "epoch": 0.94, + "grad_norm": 1.0777794104385683, + "learning_rate": 2.1616206348594737e-07, + "loss": 0.4503, + "step": 12061 + }, + { + "epoch": 0.94, + "grad_norm": 1.2460140095132886, + "learning_rate": 2.1564275857160922e-07, + "loss": 0.4804, + "step": 12062 + }, + { + "epoch": 0.94, + "grad_norm": 1.1207008337859716, + "learning_rate": 2.151240713942271e-07, + "loss": 0.4991, + "step": 12063 + }, + { + "epoch": 0.94, + "grad_norm": 1.2149277930918623, + "learning_rate": 2.1460600198654368e-07, + "loss": 0.49, + "step": 12064 + }, + { + "epoch": 0.94, + "grad_norm": 1.17303587317349, + "learning_rate": 2.1408855038126953e-07, + "loss": 0.4569, + "step": 12065 + }, + { + "epoch": 0.94, + "grad_norm": 1.21212734709437, + "learning_rate": 2.135717166110729e-07, + "loss": 0.4933, + "step": 12066 + }, + { + "epoch": 0.94, + "grad_norm": 1.0934133283980239, + "learning_rate": 2.1305550070858773e-07, + "loss": 0.4728, + "step": 12067 + }, + { + "epoch": 0.94, + "grad_norm": 1.3152276261295004, + "learning_rate": 2.1253990270640013e-07, + "loss": 0.5402, + "step": 12068 + }, + { + "epoch": 0.94, + "grad_norm": 1.2297246226968586, + "learning_rate": 2.1202492263706743e-07, + "loss": 0.5255, + "step": 12069 + }, + { + "epoch": 0.94, + "grad_norm": 1.1621308140420006, + "learning_rate": 2.1151056053309915e-07, + "loss": 0.4744, + "step": 12070 + }, + { + "epoch": 0.94, + "grad_norm": 1.225138426764048, + "learning_rate": 2.1099681642697156e-07, + "loss": 0.4856, + "step": 12071 + }, + { + "epoch": 0.94, + "grad_norm": 1.1916113004738296, + "learning_rate": 2.104836903511198e-07, + "loss": 0.4949, + "step": 12072 + }, + { + "epoch": 0.94, + "grad_norm": 1.3308733608808658, + "learning_rate": 2.0997118233794023e-07, + "loss": 0.4825, + "step": 12073 + }, + { + "epoch": 0.94, + "grad_norm": 1.1602075225083806, + "learning_rate": 2.0945929241978913e-07, + "loss": 0.4781, + "step": 12074 + }, + { + "epoch": 0.94, + "grad_norm": 1.176962832244029, + "learning_rate": 2.089480206289851e-07, + "loss": 0.5153, + "step": 12075 + }, + { + "epoch": 0.94, + "grad_norm": 1.2696486668207583, + "learning_rate": 2.0843736699780792e-07, + "loss": 0.4938, + "step": 12076 + }, + { + "epoch": 0.94, + "grad_norm": 1.2157514315170592, + "learning_rate": 2.0792733155849842e-07, + "loss": 0.5243, + "step": 12077 + }, + { + "epoch": 0.94, + "grad_norm": 1.1521961699558692, + "learning_rate": 2.074179143432564e-07, + "loss": 0.4574, + "step": 12078 + }, + { + "epoch": 0.94, + "grad_norm": 1.1489113589126132, + "learning_rate": 2.0690911538424507e-07, + "loss": 0.4659, + "step": 12079 + }, + { + "epoch": 0.94, + "grad_norm": 1.195690967755979, + "learning_rate": 2.0640093471358648e-07, + "loss": 0.5091, + "step": 12080 + }, + { + "epoch": 0.94, + "grad_norm": 1.20140458000584, + "learning_rate": 2.0589337236336493e-07, + "loss": 0.5135, + "step": 12081 + }, + { + "epoch": 0.94, + "grad_norm": 1.3190972869775741, + "learning_rate": 2.0538642836562484e-07, + "loss": 0.4888, + "step": 12082 + }, + { + "epoch": 0.94, + "grad_norm": 1.2585202234103547, + "learning_rate": 2.0488010275237502e-07, + "loss": 0.5402, + "step": 12083 + }, + { + "epoch": 0.94, + "grad_norm": 1.3221546391348353, + "learning_rate": 2.0437439555557993e-07, + "loss": 0.5438, + "step": 12084 + }, + { + "epoch": 0.94, + "grad_norm": 1.203023782410991, + "learning_rate": 2.0386930680716732e-07, + "loss": 0.4829, + "step": 12085 + }, + { + "epoch": 0.94, + "grad_norm": 1.2283256153362836, + "learning_rate": 2.0336483653902727e-07, + "loss": 0.515, + "step": 12086 + }, + { + "epoch": 0.94, + "grad_norm": 1.2224562118738795, + "learning_rate": 2.0286098478300986e-07, + "loss": 0.4931, + "step": 12087 + }, + { + "epoch": 0.94, + "grad_norm": 1.1949322164137284, + "learning_rate": 2.0235775157092407e-07, + "loss": 0.4824, + "step": 12088 + }, + { + "epoch": 0.94, + "grad_norm": 1.3721185750380827, + "learning_rate": 2.0185513693454338e-07, + "loss": 0.5229, + "step": 12089 + }, + { + "epoch": 0.94, + "grad_norm": 1.2677016469903972, + "learning_rate": 2.01353140905598e-07, + "loss": 0.5052, + "step": 12090 + }, + { + "epoch": 0.94, + "grad_norm": 1.088415159817075, + "learning_rate": 2.0085176351578472e-07, + "loss": 0.482, + "step": 12091 + }, + { + "epoch": 0.94, + "grad_norm": 1.2008831390457646, + "learning_rate": 2.0035100479675607e-07, + "loss": 0.4701, + "step": 12092 + }, + { + "epoch": 0.94, + "grad_norm": 1.3032987135771403, + "learning_rate": 1.9985086478012782e-07, + "loss": 0.4644, + "step": 12093 + }, + { + "epoch": 0.94, + "grad_norm": 1.2269952095827388, + "learning_rate": 1.9935134349747698e-07, + "loss": 0.5035, + "step": 12094 + }, + { + "epoch": 0.94, + "grad_norm": 1.2716009552419554, + "learning_rate": 1.9885244098034052e-07, + "loss": 0.5514, + "step": 12095 + }, + { + "epoch": 0.94, + "grad_norm": 1.1780067748796237, + "learning_rate": 1.9835415726021656e-07, + "loss": 0.489, + "step": 12096 + }, + { + "epoch": 0.94, + "grad_norm": 1.1294849852652744, + "learning_rate": 1.9785649236856442e-07, + "loss": 0.4691, + "step": 12097 + }, + { + "epoch": 0.94, + "grad_norm": 1.1899185147155678, + "learning_rate": 1.9735944633680448e-07, + "loss": 0.4463, + "step": 12098 + }, + { + "epoch": 0.94, + "grad_norm": 1.1943997157886674, + "learning_rate": 1.9686301919631833e-07, + "loss": 0.4662, + "step": 12099 + }, + { + "epoch": 0.94, + "grad_norm": 1.3422750317915735, + "learning_rate": 1.9636721097844648e-07, + "loss": 0.573, + "step": 12100 + }, + { + "epoch": 0.94, + "grad_norm": 1.1873344004760293, + "learning_rate": 1.9587202171449272e-07, + "loss": 0.4481, + "step": 12101 + }, + { + "epoch": 0.94, + "grad_norm": 1.2233954684271022, + "learning_rate": 1.9537745143572096e-07, + "loss": 0.5118, + "step": 12102 + }, + { + "epoch": 0.94, + "grad_norm": 1.3482171259825788, + "learning_rate": 1.948835001733551e-07, + "loss": 0.4809, + "step": 12103 + }, + { + "epoch": 0.94, + "grad_norm": 1.2038698244123216, + "learning_rate": 1.9439016795858357e-07, + "loss": 0.5116, + "step": 12104 + }, + { + "epoch": 0.94, + "grad_norm": 1.1548691829894526, + "learning_rate": 1.9389745482254918e-07, + "loss": 0.5095, + "step": 12105 + }, + { + "epoch": 0.94, + "grad_norm": 1.187787774001014, + "learning_rate": 1.9340536079636263e-07, + "loss": 0.5525, + "step": 12106 + }, + { + "epoch": 0.94, + "grad_norm": 1.199857212551977, + "learning_rate": 1.9291388591109017e-07, + "loss": 0.5203, + "step": 12107 + }, + { + "epoch": 0.94, + "grad_norm": 1.2188206676633773, + "learning_rate": 1.9242303019776253e-07, + "loss": 0.5279, + "step": 12108 + }, + { + "epoch": 0.94, + "grad_norm": 1.220417837756893, + "learning_rate": 1.919327936873694e-07, + "loss": 0.5136, + "step": 12109 + }, + { + "epoch": 0.94, + "grad_norm": 1.1514269268919335, + "learning_rate": 1.9144317641086152e-07, + "loss": 0.5104, + "step": 12110 + }, + { + "epoch": 0.94, + "grad_norm": 1.3972273977338436, + "learning_rate": 1.9095417839915198e-07, + "loss": 0.5184, + "step": 12111 + }, + { + "epoch": 0.94, + "grad_norm": 1.2253323843640653, + "learning_rate": 1.9046579968311274e-07, + "loss": 0.4759, + "step": 12112 + }, + { + "epoch": 0.94, + "grad_norm": 1.2712405860274745, + "learning_rate": 1.8997804029357801e-07, + "loss": 0.4777, + "step": 12113 + }, + { + "epoch": 0.94, + "grad_norm": 1.2710229179843224, + "learning_rate": 1.894909002613432e-07, + "loss": 0.4859, + "step": 12114 + }, + { + "epoch": 0.94, + "grad_norm": 1.2055270504157332, + "learning_rate": 1.8900437961716257e-07, + "loss": 0.5429, + "step": 12115 + }, + { + "epoch": 0.94, + "grad_norm": 1.2961122958722953, + "learning_rate": 1.8851847839175375e-07, + "loss": 0.5402, + "step": 12116 + }, + { + "epoch": 0.94, + "grad_norm": 1.28574777457356, + "learning_rate": 1.8803319661579554e-07, + "loss": 0.515, + "step": 12117 + }, + { + "epoch": 0.94, + "grad_norm": 1.1895949806850161, + "learning_rate": 1.8754853431992348e-07, + "loss": 0.4661, + "step": 12118 + }, + { + "epoch": 0.94, + "grad_norm": 1.2118984224476805, + "learning_rate": 1.870644915347386e-07, + "loss": 0.5025, + "step": 12119 + }, + { + "epoch": 0.94, + "grad_norm": 1.143400447356674, + "learning_rate": 1.865810682907987e-07, + "loss": 0.4463, + "step": 12120 + }, + { + "epoch": 0.94, + "grad_norm": 1.2652369716767213, + "learning_rate": 1.8609826461862824e-07, + "loss": 0.4942, + "step": 12121 + }, + { + "epoch": 0.94, + "grad_norm": 1.1889098983068962, + "learning_rate": 1.8561608054870615e-07, + "loss": 0.4918, + "step": 12122 + }, + { + "epoch": 0.94, + "grad_norm": 1.0636012597097029, + "learning_rate": 1.8513451611147704e-07, + "loss": 0.4285, + "step": 12123 + }, + { + "epoch": 0.94, + "grad_norm": 1.2256293178951103, + "learning_rate": 1.8465357133734542e-07, + "loss": 0.5635, + "step": 12124 + }, + { + "epoch": 0.94, + "grad_norm": 1.2966667375752512, + "learning_rate": 1.8417324625667254e-07, + "loss": 0.488, + "step": 12125 + }, + { + "epoch": 0.94, + "grad_norm": 1.220424675273221, + "learning_rate": 1.8369354089978643e-07, + "loss": 0.4745, + "step": 12126 + }, + { + "epoch": 0.94, + "grad_norm": 1.2795046572076876, + "learning_rate": 1.832144552969728e-07, + "loss": 0.5058, + "step": 12127 + }, + { + "epoch": 0.94, + "grad_norm": 1.2634688484025987, + "learning_rate": 1.827359894784775e-07, + "loss": 0.5507, + "step": 12128 + }, + { + "epoch": 0.94, + "grad_norm": 1.1381632223315028, + "learning_rate": 1.8225814347451077e-07, + "loss": 0.4786, + "step": 12129 + }, + { + "epoch": 0.94, + "grad_norm": 1.1123390359908596, + "learning_rate": 1.817809173152396e-07, + "loss": 0.4237, + "step": 12130 + }, + { + "epoch": 0.94, + "grad_norm": 1.1959656570186052, + "learning_rate": 1.8130431103079437e-07, + "loss": 0.505, + "step": 12131 + }, + { + "epoch": 0.94, + "grad_norm": 1.2462354716962607, + "learning_rate": 1.8082832465126544e-07, + "loss": 0.5069, + "step": 12132 + }, + { + "epoch": 0.94, + "grad_norm": 1.1986920818700946, + "learning_rate": 1.803529582067054e-07, + "loss": 0.4646, + "step": 12133 + }, + { + "epoch": 0.94, + "grad_norm": 1.3195592322807088, + "learning_rate": 1.7987821172712584e-07, + "loss": 0.5974, + "step": 12134 + }, + { + "epoch": 0.94, + "grad_norm": 1.312850950640985, + "learning_rate": 1.7940408524249942e-07, + "loss": 0.5147, + "step": 12135 + }, + { + "epoch": 0.94, + "grad_norm": 1.2052311977810213, + "learning_rate": 1.789305787827611e-07, + "loss": 0.5016, + "step": 12136 + }, + { + "epoch": 0.94, + "grad_norm": 1.314242704647338, + "learning_rate": 1.7845769237780585e-07, + "loss": 0.5303, + "step": 12137 + }, + { + "epoch": 0.94, + "grad_norm": 1.251032927030242, + "learning_rate": 1.779854260574887e-07, + "loss": 0.5468, + "step": 12138 + }, + { + "epoch": 0.94, + "grad_norm": 1.2081209467441474, + "learning_rate": 1.7751377985162689e-07, + "loss": 0.448, + "step": 12139 + }, + { + "epoch": 0.94, + "grad_norm": 1.0903966451479938, + "learning_rate": 1.770427537899966e-07, + "loss": 0.4337, + "step": 12140 + }, + { + "epoch": 0.94, + "grad_norm": 1.2070133738752629, + "learning_rate": 1.7657234790233736e-07, + "loss": 0.4701, + "step": 12141 + }, + { + "epoch": 0.94, + "grad_norm": 1.1493545815011306, + "learning_rate": 1.7610256221834765e-07, + "loss": 0.506, + "step": 12142 + }, + { + "epoch": 0.94, + "grad_norm": 1.2990020406059466, + "learning_rate": 1.756333967676882e-07, + "loss": 0.5221, + "step": 12143 + }, + { + "epoch": 0.94, + "grad_norm": 1.2151532039675281, + "learning_rate": 1.7516485157997975e-07, + "loss": 0.5042, + "step": 12144 + }, + { + "epoch": 0.94, + "grad_norm": 1.4123089281328949, + "learning_rate": 1.746969266848031e-07, + "loss": 0.5706, + "step": 12145 + }, + { + "epoch": 0.94, + "grad_norm": 1.2871002058012264, + "learning_rate": 1.7422962211170236e-07, + "loss": 0.543, + "step": 12146 + }, + { + "epoch": 0.94, + "grad_norm": 1.2048201570809265, + "learning_rate": 1.737629378901795e-07, + "loss": 0.5779, + "step": 12147 + }, + { + "epoch": 0.94, + "grad_norm": 1.1451704304051766, + "learning_rate": 1.7329687404969874e-07, + "loss": 0.4568, + "step": 12148 + }, + { + "epoch": 0.94, + "grad_norm": 1.0856477090044963, + "learning_rate": 1.7283143061968654e-07, + "loss": 0.4355, + "step": 12149 + }, + { + "epoch": 0.94, + "grad_norm": 1.2774099163685264, + "learning_rate": 1.7236660762952606e-07, + "loss": 0.5469, + "step": 12150 + }, + { + "epoch": 0.94, + "grad_norm": 1.2163923886292756, + "learning_rate": 1.7190240510856605e-07, + "loss": 0.4998, + "step": 12151 + }, + { + "epoch": 0.94, + "grad_norm": 1.180338395783769, + "learning_rate": 1.7143882308611305e-07, + "loss": 0.4558, + "step": 12152 + }, + { + "epoch": 0.94, + "grad_norm": 1.3467759062741584, + "learning_rate": 1.7097586159143698e-07, + "loss": 0.5154, + "step": 12153 + }, + { + "epoch": 0.94, + "grad_norm": 1.2212399210362663, + "learning_rate": 1.7051352065376448e-07, + "loss": 0.4634, + "step": 12154 + }, + { + "epoch": 0.94, + "grad_norm": 1.2382064941175563, + "learning_rate": 1.7005180030228886e-07, + "loss": 0.4759, + "step": 12155 + }, + { + "epoch": 0.94, + "grad_norm": 1.17365907286405, + "learning_rate": 1.695907005661568e-07, + "loss": 0.501, + "step": 12156 + }, + { + "epoch": 0.94, + "grad_norm": 1.2413998871934906, + "learning_rate": 1.691302214744839e-07, + "loss": 0.4929, + "step": 12157 + }, + { + "epoch": 0.94, + "grad_norm": 1.2592669779072443, + "learning_rate": 1.6867036305634022e-07, + "loss": 0.5177, + "step": 12158 + }, + { + "epoch": 0.94, + "grad_norm": 1.305444235260479, + "learning_rate": 1.6821112534075924e-07, + "loss": 0.5249, + "step": 12159 + }, + { + "epoch": 0.94, + "grad_norm": 1.2793911731705891, + "learning_rate": 1.6775250835673552e-07, + "loss": 0.5262, + "step": 12160 + }, + { + "epoch": 0.94, + "grad_norm": 1.211365458021725, + "learning_rate": 1.6729451213322255e-07, + "loss": 0.4999, + "step": 12161 + }, + { + "epoch": 0.94, + "grad_norm": 1.2238898346042506, + "learning_rate": 1.6683713669913836e-07, + "loss": 0.5112, + "step": 12162 + }, + { + "epoch": 0.94, + "grad_norm": 1.2393212508850167, + "learning_rate": 1.6638038208335762e-07, + "loss": 0.507, + "step": 12163 + }, + { + "epoch": 0.94, + "grad_norm": 1.2889482852159195, + "learning_rate": 1.6592424831471832e-07, + "loss": 0.5763, + "step": 12164 + }, + { + "epoch": 0.94, + "grad_norm": 1.204404819042177, + "learning_rate": 1.6546873542201858e-07, + "loss": 0.4693, + "step": 12165 + }, + { + "epoch": 0.94, + "grad_norm": 1.1430678141510713, + "learning_rate": 1.650138434340165e-07, + "loss": 0.4736, + "step": 12166 + }, + { + "epoch": 0.94, + "grad_norm": 1.1266679584808017, + "learning_rate": 1.6455957237943354e-07, + "loss": 0.5101, + "step": 12167 + }, + { + "epoch": 0.94, + "grad_norm": 1.2498853154024294, + "learning_rate": 1.6410592228694788e-07, + "loss": 0.4867, + "step": 12168 + }, + { + "epoch": 0.94, + "grad_norm": 1.1998884586311045, + "learning_rate": 1.6365289318520216e-07, + "loss": 0.4719, + "step": 12169 + }, + { + "epoch": 0.94, + "grad_norm": 1.1959578822511996, + "learning_rate": 1.6320048510279906e-07, + "loss": 0.4996, + "step": 12170 + }, + { + "epoch": 0.94, + "grad_norm": 1.3564610435721631, + "learning_rate": 1.6274869806829906e-07, + "loss": 0.537, + "step": 12171 + }, + { + "epoch": 0.94, + "grad_norm": 1.2130775491906285, + "learning_rate": 1.6229753211022825e-07, + "loss": 0.4888, + "step": 12172 + }, + { + "epoch": 0.94, + "grad_norm": 1.10593094616037, + "learning_rate": 1.6184698725706938e-07, + "loss": 0.4739, + "step": 12173 + }, + { + "epoch": 0.94, + "grad_norm": 1.1251039456983958, + "learning_rate": 1.613970635372686e-07, + "loss": 0.4732, + "step": 12174 + }, + { + "epoch": 0.94, + "grad_norm": 1.208432812816052, + "learning_rate": 1.6094776097923205e-07, + "loss": 0.4979, + "step": 12175 + }, + { + "epoch": 0.94, + "grad_norm": 1.228300576120459, + "learning_rate": 1.6049907961132595e-07, + "loss": 0.5279, + "step": 12176 + }, + { + "epoch": 0.94, + "grad_norm": 1.178895242183351, + "learning_rate": 1.6005101946187873e-07, + "loss": 0.4616, + "step": 12177 + }, + { + "epoch": 0.94, + "grad_norm": 1.2592580319743523, + "learning_rate": 1.596035805591778e-07, + "loss": 0.5268, + "step": 12178 + }, + { + "epoch": 0.94, + "grad_norm": 1.2461039862615906, + "learning_rate": 1.5915676293147275e-07, + "loss": 0.5007, + "step": 12179 + }, + { + "epoch": 0.94, + "grad_norm": 1.1933419933395568, + "learning_rate": 1.5871056660697326e-07, + "loss": 0.4544, + "step": 12180 + }, + { + "epoch": 0.94, + "grad_norm": 1.2475679102361281, + "learning_rate": 1.58264991613849e-07, + "loss": 0.5029, + "step": 12181 + }, + { + "epoch": 0.95, + "grad_norm": 1.1796509187597046, + "learning_rate": 1.5782003798023306e-07, + "loss": 0.4684, + "step": 12182 + }, + { + "epoch": 0.95, + "grad_norm": 1.1946990489675904, + "learning_rate": 1.5737570573421735e-07, + "loss": 0.5202, + "step": 12183 + }, + { + "epoch": 0.95, + "grad_norm": 1.237524047772037, + "learning_rate": 1.56931994903855e-07, + "loss": 0.5776, + "step": 12184 + }, + { + "epoch": 0.95, + "grad_norm": 1.1674334980487466, + "learning_rate": 1.5648890551715924e-07, + "loss": 0.4949, + "step": 12185 + }, + { + "epoch": 0.95, + "grad_norm": 1.3034596404575625, + "learning_rate": 1.5604643760210426e-07, + "loss": 0.4857, + "step": 12186 + }, + { + "epoch": 0.95, + "grad_norm": 1.1972559668694358, + "learning_rate": 1.5560459118662663e-07, + "loss": 0.5006, + "step": 12187 + }, + { + "epoch": 0.95, + "grad_norm": 1.1583195799873276, + "learning_rate": 1.5516336629862073e-07, + "loss": 0.4864, + "step": 12188 + }, + { + "epoch": 0.95, + "grad_norm": 1.302249206150199, + "learning_rate": 1.5472276296594424e-07, + "loss": 0.5478, + "step": 12189 + }, + { + "epoch": 0.95, + "grad_norm": 1.4013798198064138, + "learning_rate": 1.5428278121641494e-07, + "loss": 0.5301, + "step": 12190 + }, + { + "epoch": 0.95, + "grad_norm": 1.1608281100708564, + "learning_rate": 1.538434210778106e-07, + "loss": 0.5069, + "step": 12191 + }, + { + "epoch": 0.95, + "grad_norm": 1.3397885414070871, + "learning_rate": 1.5340468257787012e-07, + "loss": 0.5336, + "step": 12192 + }, + { + "epoch": 0.95, + "grad_norm": 1.2724435121480506, + "learning_rate": 1.5296656574429469e-07, + "loss": 0.5334, + "step": 12193 + }, + { + "epoch": 0.95, + "grad_norm": 1.240427365643985, + "learning_rate": 1.5252907060474332e-07, + "loss": 0.5081, + "step": 12194 + }, + { + "epoch": 0.95, + "grad_norm": 1.1448613234418503, + "learning_rate": 1.5209219718683833e-07, + "loss": 0.509, + "step": 12195 + }, + { + "epoch": 0.95, + "grad_norm": 1.1004543276525134, + "learning_rate": 1.5165594551816209e-07, + "loss": 0.4182, + "step": 12196 + }, + { + "epoch": 0.95, + "grad_norm": 1.2538325683241198, + "learning_rate": 1.5122031562625593e-07, + "loss": 0.5403, + "step": 12197 + }, + { + "epoch": 0.95, + "grad_norm": 1.1693993931748687, + "learning_rate": 1.5078530753862453e-07, + "loss": 0.4503, + "step": 12198 + }, + { + "epoch": 0.95, + "grad_norm": 1.0690122628360594, + "learning_rate": 1.5035092128273144e-07, + "loss": 0.4536, + "step": 12199 + }, + { + "epoch": 0.95, + "grad_norm": 1.287949191130858, + "learning_rate": 1.4991715688600362e-07, + "loss": 0.5689, + "step": 12200 + }, + { + "epoch": 0.95, + "grad_norm": 1.1418118507533332, + "learning_rate": 1.494840143758236e-07, + "loss": 0.4651, + "step": 12201 + }, + { + "epoch": 0.95, + "grad_norm": 1.2050147140364351, + "learning_rate": 1.4905149377954064e-07, + "loss": 0.5358, + "step": 12202 + }, + { + "epoch": 0.95, + "grad_norm": 1.2112102570523997, + "learning_rate": 1.4861959512446067e-07, + "loss": 0.4725, + "step": 12203 + }, + { + "epoch": 0.95, + "grad_norm": 1.1682220490514175, + "learning_rate": 1.4818831843785297e-07, + "loss": 0.4725, + "step": 12204 + }, + { + "epoch": 0.95, + "grad_norm": 1.2655820957022974, + "learning_rate": 1.4775766374694466e-07, + "loss": 0.5276, + "step": 12205 + }, + { + "epoch": 0.95, + "grad_norm": 1.2657145244818975, + "learning_rate": 1.4732763107892734e-07, + "loss": 0.5174, + "step": 12206 + }, + { + "epoch": 0.95, + "grad_norm": 1.193764925159402, + "learning_rate": 1.4689822046094816e-07, + "loss": 0.4801, + "step": 12207 + }, + { + "epoch": 0.95, + "grad_norm": 1.3143625214002959, + "learning_rate": 1.4646943192011986e-07, + "loss": 0.5459, + "step": 12208 + }, + { + "epoch": 0.95, + "grad_norm": 1.2619091161959535, + "learning_rate": 1.4604126548351416e-07, + "loss": 0.5323, + "step": 12209 + }, + { + "epoch": 0.95, + "grad_norm": 1.3238760355600567, + "learning_rate": 1.4561372117816276e-07, + "loss": 0.5621, + "step": 12210 + }, + { + "epoch": 0.95, + "grad_norm": 1.2289468223824649, + "learning_rate": 1.451867990310596e-07, + "loss": 0.4882, + "step": 12211 + }, + { + "epoch": 0.95, + "grad_norm": 1.2559498328680467, + "learning_rate": 1.4476049906915756e-07, + "loss": 0.4945, + "step": 12212 + }, + { + "epoch": 0.95, + "grad_norm": 1.1901960386156154, + "learning_rate": 1.4433482131937183e-07, + "loss": 0.4715, + "step": 12213 + }, + { + "epoch": 0.95, + "grad_norm": 1.2658871508708545, + "learning_rate": 1.439097658085764e-07, + "loss": 0.5432, + "step": 12214 + }, + { + "epoch": 0.95, + "grad_norm": 1.3043492044220428, + "learning_rate": 1.4348533256360985e-07, + "loss": 0.4938, + "step": 12215 + }, + { + "epoch": 0.95, + "grad_norm": 1.274175979766508, + "learning_rate": 1.430615216112674e-07, + "loss": 0.4766, + "step": 12216 + }, + { + "epoch": 0.95, + "grad_norm": 1.1647408064466906, + "learning_rate": 1.426383329783043e-07, + "loss": 0.4779, + "step": 12217 + }, + { + "epoch": 0.95, + "grad_norm": 1.2916490902268833, + "learning_rate": 1.4221576669144144e-07, + "loss": 0.5268, + "step": 12218 + }, + { + "epoch": 0.95, + "grad_norm": 1.3536018024880108, + "learning_rate": 1.417938227773563e-07, + "loss": 0.5474, + "step": 12219 + }, + { + "epoch": 0.95, + "grad_norm": 1.2916682868865321, + "learning_rate": 1.4137250126268876e-07, + "loss": 0.5385, + "step": 12220 + }, + { + "epoch": 0.95, + "grad_norm": 1.172899586681365, + "learning_rate": 1.409518021740408e-07, + "loss": 0.4714, + "step": 12221 + }, + { + "epoch": 0.95, + "grad_norm": 1.2524518761745613, + "learning_rate": 1.4053172553797012e-07, + "loss": 0.5316, + "step": 12222 + }, + { + "epoch": 0.95, + "grad_norm": 1.2400412581870919, + "learning_rate": 1.4011227138099882e-07, + "loss": 0.4826, + "step": 12223 + }, + { + "epoch": 0.95, + "grad_norm": 1.151157712768384, + "learning_rate": 1.3969343972961124e-07, + "loss": 0.4675, + "step": 12224 + }, + { + "epoch": 0.95, + "grad_norm": 1.2691926944908332, + "learning_rate": 1.3927523061024962e-07, + "loss": 0.538, + "step": 12225 + }, + { + "epoch": 0.95, + "grad_norm": 1.2552149232978944, + "learning_rate": 1.3885764404931835e-07, + "loss": 0.5108, + "step": 12226 + }, + { + "epoch": 0.95, + "grad_norm": 1.1463036467832868, + "learning_rate": 1.3844068007317856e-07, + "loss": 0.4776, + "step": 12227 + }, + { + "epoch": 0.95, + "grad_norm": 1.2764279877148006, + "learning_rate": 1.3802433870815922e-07, + "loss": 0.5208, + "step": 12228 + }, + { + "epoch": 0.95, + "grad_norm": 1.1551585329751224, + "learning_rate": 1.3760861998054264e-07, + "loss": 0.4995, + "step": 12229 + }, + { + "epoch": 0.95, + "grad_norm": 1.244411468967765, + "learning_rate": 1.3719352391657893e-07, + "loss": 0.5064, + "step": 12230 + }, + { + "epoch": 0.95, + "grad_norm": 1.115744612663472, + "learning_rate": 1.3677905054247265e-07, + "loss": 0.4775, + "step": 12231 + }, + { + "epoch": 0.95, + "grad_norm": 1.1452109756332838, + "learning_rate": 1.363651998843929e-07, + "loss": 0.4523, + "step": 12232 + }, + { + "epoch": 0.95, + "grad_norm": 1.2152056875493829, + "learning_rate": 1.3595197196846655e-07, + "loss": 0.4958, + "step": 12233 + }, + { + "epoch": 0.95, + "grad_norm": 1.38415849770913, + "learning_rate": 1.3553936682078494e-07, + "loss": 0.5255, + "step": 12234 + }, + { + "epoch": 0.95, + "grad_norm": 1.298169374841616, + "learning_rate": 1.3512738446739726e-07, + "loss": 0.5321, + "step": 12235 + }, + { + "epoch": 0.95, + "grad_norm": 1.1781939215699344, + "learning_rate": 1.347160249343149e-07, + "loss": 0.4636, + "step": 12236 + }, + { + "epoch": 0.95, + "grad_norm": 1.242034326862186, + "learning_rate": 1.34305288247506e-07, + "loss": 0.5068, + "step": 12237 + }, + { + "epoch": 0.95, + "grad_norm": 1.2131952219650455, + "learning_rate": 1.3389517443290535e-07, + "loss": 0.5173, + "step": 12238 + }, + { + "epoch": 0.95, + "grad_norm": 1.2971394280789845, + "learning_rate": 1.3348568351640446e-07, + "loss": 0.5145, + "step": 12239 + }, + { + "epoch": 0.95, + "grad_norm": 1.3390528176668712, + "learning_rate": 1.3307681552385598e-07, + "loss": 0.5027, + "step": 12240 + }, + { + "epoch": 0.95, + "grad_norm": 1.1539731160640494, + "learning_rate": 1.3266857048107706e-07, + "loss": 0.4661, + "step": 12241 + }, + { + "epoch": 0.95, + "grad_norm": 1.0989461901146003, + "learning_rate": 1.322609484138382e-07, + "loss": 0.4582, + "step": 12242 + }, + { + "epoch": 0.95, + "grad_norm": 1.2298251451866886, + "learning_rate": 1.3185394934787766e-07, + "loss": 0.5426, + "step": 12243 + }, + { + "epoch": 0.95, + "grad_norm": 1.3323725377826037, + "learning_rate": 1.3144757330888934e-07, + "loss": 0.5382, + "step": 12244 + }, + { + "epoch": 0.95, + "grad_norm": 1.2823948746068934, + "learning_rate": 1.3104182032253164e-07, + "loss": 0.5275, + "step": 12245 + }, + { + "epoch": 0.95, + "grad_norm": 1.179933888555367, + "learning_rate": 1.3063669041442074e-07, + "loss": 0.4926, + "step": 12246 + }, + { + "epoch": 0.95, + "grad_norm": 1.3084780969328043, + "learning_rate": 1.302321836101339e-07, + "loss": 0.5481, + "step": 12247 + }, + { + "epoch": 0.95, + "grad_norm": 1.2178695139118783, + "learning_rate": 1.2982829993521185e-07, + "loss": 0.5642, + "step": 12248 + }, + { + "epoch": 0.95, + "grad_norm": 1.1828781331446212, + "learning_rate": 1.2942503941515082e-07, + "loss": 0.4654, + "step": 12249 + }, + { + "epoch": 0.95, + "grad_norm": 1.2747656326307972, + "learning_rate": 1.2902240207541384e-07, + "loss": 0.5077, + "step": 12250 + }, + { + "epoch": 0.95, + "grad_norm": 1.194592227714184, + "learning_rate": 1.286203879414205e-07, + "loss": 0.4848, + "step": 12251 + }, + { + "epoch": 0.95, + "grad_norm": 1.241201285167893, + "learning_rate": 1.2821899703855057e-07, + "loss": 0.5149, + "step": 12252 + }, + { + "epoch": 0.95, + "grad_norm": 1.2949184227806783, + "learning_rate": 1.2781822939214817e-07, + "loss": 0.5862, + "step": 12253 + }, + { + "epoch": 0.95, + "grad_norm": 1.0716363001585496, + "learning_rate": 1.2741808502751417e-07, + "loss": 0.4715, + "step": 12254 + }, + { + "epoch": 0.95, + "grad_norm": 1.2851832888463306, + "learning_rate": 1.2701856396991285e-07, + "loss": 0.4644, + "step": 12255 + }, + { + "epoch": 0.95, + "grad_norm": 1.3106029060768147, + "learning_rate": 1.2661966624456733e-07, + "loss": 0.4978, + "step": 12256 + }, + { + "epoch": 0.95, + "grad_norm": 1.277710188354227, + "learning_rate": 1.2622139187666083e-07, + "loss": 0.5025, + "step": 12257 + }, + { + "epoch": 0.95, + "grad_norm": 1.1920888477470342, + "learning_rate": 1.2582374089134096e-07, + "loss": 0.4723, + "step": 12258 + }, + { + "epoch": 0.95, + "grad_norm": 1.1803714209674714, + "learning_rate": 1.2542671331371214e-07, + "loss": 0.4962, + "step": 12259 + }, + { + "epoch": 0.95, + "grad_norm": 1.1894195000667245, + "learning_rate": 1.2503030916884097e-07, + "loss": 0.4883, + "step": 12260 + }, + { + "epoch": 0.95, + "grad_norm": 1.3001962898689665, + "learning_rate": 1.2463452848175516e-07, + "loss": 0.5612, + "step": 12261 + }, + { + "epoch": 0.95, + "grad_norm": 1.3101333443283396, + "learning_rate": 1.2423937127744146e-07, + "loss": 0.5686, + "step": 12262 + }, + { + "epoch": 0.95, + "grad_norm": 1.1750593028422982, + "learning_rate": 1.2384483758084765e-07, + "loss": 0.4837, + "step": 12263 + }, + { + "epoch": 0.95, + "grad_norm": 1.2637020148761489, + "learning_rate": 1.234509274168838e-07, + "loss": 0.479, + "step": 12264 + }, + { + "epoch": 0.95, + "grad_norm": 1.1423764303482071, + "learning_rate": 1.2305764081042003e-07, + "loss": 0.4461, + "step": 12265 + }, + { + "epoch": 0.95, + "grad_norm": 1.2452852502232261, + "learning_rate": 1.226649777862854e-07, + "loss": 0.5491, + "step": 12266 + }, + { + "epoch": 0.95, + "grad_norm": 1.205153452097077, + "learning_rate": 1.2227293836927112e-07, + "loss": 0.5095, + "step": 12267 + }, + { + "epoch": 0.95, + "grad_norm": 1.1616423365609658, + "learning_rate": 1.2188152258412855e-07, + "loss": 0.4445, + "step": 12268 + }, + { + "epoch": 0.95, + "grad_norm": 1.328134424512747, + "learning_rate": 1.2149073045557014e-07, + "loss": 0.5564, + "step": 12269 + }, + { + "epoch": 0.95, + "grad_norm": 1.2008558400304161, + "learning_rate": 1.2110056200826725e-07, + "loss": 0.4861, + "step": 12270 + }, + { + "epoch": 0.95, + "grad_norm": 1.1278239411164566, + "learning_rate": 1.2071101726685464e-07, + "loss": 0.4918, + "step": 12271 + }, + { + "epoch": 0.95, + "grad_norm": 1.3167760312658008, + "learning_rate": 1.2032209625592705e-07, + "loss": 0.5583, + "step": 12272 + }, + { + "epoch": 0.95, + "grad_norm": 1.2127233810727611, + "learning_rate": 1.199337990000371e-07, + "loss": 0.5628, + "step": 12273 + }, + { + "epoch": 0.95, + "grad_norm": 1.3518681704224773, + "learning_rate": 1.195461255237007e-07, + "loss": 0.5273, + "step": 12274 + }, + { + "epoch": 0.95, + "grad_norm": 1.2611280540741643, + "learning_rate": 1.1915907585139385e-07, + "loss": 0.4632, + "step": 12275 + }, + { + "epoch": 0.95, + "grad_norm": 1.293948168481518, + "learning_rate": 1.1877265000755367e-07, + "loss": 0.5143, + "step": 12276 + }, + { + "epoch": 0.95, + "grad_norm": 1.1330178962499473, + "learning_rate": 1.1838684801657619e-07, + "loss": 0.436, + "step": 12277 + }, + { + "epoch": 0.95, + "grad_norm": 1.3154319530940493, + "learning_rate": 1.1800166990281858e-07, + "loss": 0.5554, + "step": 12278 + }, + { + "epoch": 0.95, + "grad_norm": 1.2347074616251603, + "learning_rate": 1.1761711569060141e-07, + "loss": 0.4567, + "step": 12279 + }, + { + "epoch": 0.95, + "grad_norm": 1.1541361688837604, + "learning_rate": 1.1723318540420081e-07, + "loss": 0.4389, + "step": 12280 + }, + { + "epoch": 0.95, + "grad_norm": 1.2898600105355889, + "learning_rate": 1.1684987906785739e-07, + "loss": 0.496, + "step": 12281 + }, + { + "epoch": 0.95, + "grad_norm": 1.2520016379635124, + "learning_rate": 1.164671967057729e-07, + "loss": 0.5243, + "step": 12282 + }, + { + "epoch": 0.95, + "grad_norm": 1.24934470166198, + "learning_rate": 1.1608513834210578e-07, + "loss": 0.4977, + "step": 12283 + }, + { + "epoch": 0.95, + "grad_norm": 1.2150429320108975, + "learning_rate": 1.1570370400097786e-07, + "loss": 0.4987, + "step": 12284 + }, + { + "epoch": 0.95, + "grad_norm": 1.1670298805693553, + "learning_rate": 1.1532289370647209e-07, + "loss": 0.4671, + "step": 12285 + }, + { + "epoch": 0.95, + "grad_norm": 1.2199556793698574, + "learning_rate": 1.1494270748263037e-07, + "loss": 0.4974, + "step": 12286 + }, + { + "epoch": 0.95, + "grad_norm": 1.2136574009184797, + "learning_rate": 1.1456314535345569e-07, + "loss": 0.4839, + "step": 12287 + }, + { + "epoch": 0.95, + "grad_norm": 1.2427437454554302, + "learning_rate": 1.1418420734291113e-07, + "loss": 0.5363, + "step": 12288 + }, + { + "epoch": 0.95, + "grad_norm": 1.1346942399858833, + "learning_rate": 1.1380589347492199e-07, + "loss": 0.4042, + "step": 12289 + }, + { + "epoch": 0.95, + "grad_norm": 1.289938704047202, + "learning_rate": 1.134282037733725e-07, + "loss": 0.5281, + "step": 12290 + }, + { + "epoch": 0.95, + "grad_norm": 1.1204729122307449, + "learning_rate": 1.1305113826210911e-07, + "loss": 0.4439, + "step": 12291 + }, + { + "epoch": 0.95, + "grad_norm": 1.1737345374148167, + "learning_rate": 1.1267469696493726e-07, + "loss": 0.4181, + "step": 12292 + }, + { + "epoch": 0.95, + "grad_norm": 1.2308029931852427, + "learning_rate": 1.1229887990562349e-07, + "loss": 0.4944, + "step": 12293 + }, + { + "epoch": 0.95, + "grad_norm": 1.1460344773810023, + "learning_rate": 1.1192368710789547e-07, + "loss": 0.476, + "step": 12294 + }, + { + "epoch": 0.95, + "grad_norm": 1.1794433341068746, + "learning_rate": 1.1154911859543982e-07, + "loss": 0.4919, + "step": 12295 + }, + { + "epoch": 0.95, + "grad_norm": 1.121196888289827, + "learning_rate": 1.111751743919065e-07, + "loss": 0.4683, + "step": 12296 + }, + { + "epoch": 0.95, + "grad_norm": 1.1420167766681575, + "learning_rate": 1.1080185452090553e-07, + "loss": 0.4603, + "step": 12297 + }, + { + "epoch": 0.95, + "grad_norm": 1.210361959213607, + "learning_rate": 1.104291590060036e-07, + "loss": 0.4957, + "step": 12298 + }, + { + "epoch": 0.95, + "grad_norm": 1.239055163059905, + "learning_rate": 1.1005708787073189e-07, + "loss": 0.53, + "step": 12299 + }, + { + "epoch": 0.95, + "grad_norm": 1.1607990474982268, + "learning_rate": 1.096856411385827e-07, + "loss": 0.4737, + "step": 12300 + }, + { + "epoch": 0.95, + "grad_norm": 1.2387028887862161, + "learning_rate": 1.093148188330051e-07, + "loss": 0.5061, + "step": 12301 + }, + { + "epoch": 0.95, + "grad_norm": 1.3665108886575255, + "learning_rate": 1.0894462097741366e-07, + "loss": 0.5371, + "step": 12302 + }, + { + "epoch": 0.95, + "grad_norm": 1.2483374984087723, + "learning_rate": 1.0857504759517856e-07, + "loss": 0.4795, + "step": 12303 + }, + { + "epoch": 0.95, + "grad_norm": 1.141345384503703, + "learning_rate": 1.082060987096345e-07, + "loss": 0.466, + "step": 12304 + }, + { + "epoch": 0.95, + "grad_norm": 1.2948525066408771, + "learning_rate": 1.0783777434407394e-07, + "loss": 0.5574, + "step": 12305 + }, + { + "epoch": 0.95, + "grad_norm": 1.2755014742338422, + "learning_rate": 1.0747007452175051e-07, + "loss": 0.4813, + "step": 12306 + }, + { + "epoch": 0.95, + "grad_norm": 1.36835800201321, + "learning_rate": 1.071029992658823e-07, + "loss": 0.5468, + "step": 12307 + }, + { + "epoch": 0.95, + "grad_norm": 1.1439187555941306, + "learning_rate": 1.0673654859964078e-07, + "loss": 0.5287, + "step": 12308 + }, + { + "epoch": 0.95, + "grad_norm": 1.1009962642166276, + "learning_rate": 1.0637072254616298e-07, + "loss": 0.4413, + "step": 12309 + }, + { + "epoch": 0.95, + "grad_norm": 1.2153051058406228, + "learning_rate": 1.0600552112854712e-07, + "loss": 0.513, + "step": 12310 + }, + { + "epoch": 0.96, + "grad_norm": 1.4056457492896397, + "learning_rate": 1.0564094436984806e-07, + "loss": 0.5283, + "step": 12311 + }, + { + "epoch": 0.96, + "grad_norm": 1.207458025595689, + "learning_rate": 1.0527699229308519e-07, + "loss": 0.5225, + "step": 12312 + }, + { + "epoch": 0.96, + "grad_norm": 1.2940130252035187, + "learning_rate": 1.0491366492123567e-07, + "loss": 0.5125, + "step": 12313 + }, + { + "epoch": 0.96, + "grad_norm": 1.2343347277288745, + "learning_rate": 1.0455096227723782e-07, + "loss": 0.5343, + "step": 12314 + }, + { + "epoch": 0.96, + "grad_norm": 1.2877700337307048, + "learning_rate": 1.0418888438399222e-07, + "loss": 0.5395, + "step": 12315 + }, + { + "epoch": 0.96, + "grad_norm": 1.2487096802519029, + "learning_rate": 1.0382743126435723e-07, + "loss": 0.449, + "step": 12316 + }, + { + "epoch": 0.96, + "grad_norm": 1.1995727811726649, + "learning_rate": 1.0346660294115462e-07, + "loss": 0.5099, + "step": 12317 + }, + { + "epoch": 0.96, + "grad_norm": 1.214555859568537, + "learning_rate": 1.0310639943716505e-07, + "loss": 0.4675, + "step": 12318 + }, + { + "epoch": 0.96, + "grad_norm": 1.2260081988584794, + "learning_rate": 1.027468207751292e-07, + "loss": 0.4819, + "step": 12319 + }, + { + "epoch": 0.96, + "grad_norm": 1.1420835288510485, + "learning_rate": 1.0238786697775006e-07, + "loss": 0.4671, + "step": 12320 + }, + { + "epoch": 0.96, + "grad_norm": 1.2172092088127224, + "learning_rate": 1.0202953806768945e-07, + "loss": 0.4873, + "step": 12321 + }, + { + "epoch": 0.96, + "grad_norm": 1.3107450196155004, + "learning_rate": 1.0167183406757042e-07, + "loss": 0.518, + "step": 12322 + }, + { + "epoch": 0.96, + "grad_norm": 1.177888167479233, + "learning_rate": 1.0131475499997823e-07, + "loss": 0.4877, + "step": 12323 + }, + { + "epoch": 0.96, + "grad_norm": 1.2067505346486307, + "learning_rate": 1.0095830088745595e-07, + "loss": 0.5093, + "step": 12324 + }, + { + "epoch": 0.96, + "grad_norm": 1.1271417152575818, + "learning_rate": 1.0060247175250892e-07, + "loss": 0.5121, + "step": 12325 + }, + { + "epoch": 0.96, + "grad_norm": 1.2373623973753018, + "learning_rate": 1.0024726761760028e-07, + "loss": 0.4876, + "step": 12326 + }, + { + "epoch": 0.96, + "grad_norm": 1.2337505837606269, + "learning_rate": 9.989268850515876e-08, + "loss": 0.4988, + "step": 12327 + }, + { + "epoch": 0.96, + "grad_norm": 1.1731605599185688, + "learning_rate": 9.953873443756979e-08, + "loss": 0.4922, + "step": 12328 + }, + { + "epoch": 0.96, + "grad_norm": 1.2897906472118852, + "learning_rate": 9.918540543717992e-08, + "loss": 0.547, + "step": 12329 + }, + { + "epoch": 0.96, + "grad_norm": 1.202552907101029, + "learning_rate": 9.883270152629686e-08, + "loss": 0.5223, + "step": 12330 + }, + { + "epoch": 0.96, + "grad_norm": 1.218616135899447, + "learning_rate": 9.848062272718839e-08, + "loss": 0.5378, + "step": 12331 + }, + { + "epoch": 0.96, + "grad_norm": 1.198193089754735, + "learning_rate": 9.812916906208337e-08, + "loss": 0.4807, + "step": 12332 + }, + { + "epoch": 0.96, + "grad_norm": 1.2650485197570398, + "learning_rate": 9.777834055317181e-08, + "loss": 0.5138, + "step": 12333 + }, + { + "epoch": 0.96, + "grad_norm": 1.3114702635755955, + "learning_rate": 9.742813722260158e-08, + "loss": 0.5745, + "step": 12334 + }, + { + "epoch": 0.96, + "grad_norm": 1.3081470695051902, + "learning_rate": 9.707855909248387e-08, + "loss": 0.5163, + "step": 12335 + }, + { + "epoch": 0.96, + "grad_norm": 1.217707750740706, + "learning_rate": 9.67296061848888e-08, + "loss": 0.5089, + "step": 12336 + }, + { + "epoch": 0.96, + "grad_norm": 1.1921186474757552, + "learning_rate": 9.638127852184764e-08, + "loss": 0.483, + "step": 12337 + }, + { + "epoch": 0.96, + "grad_norm": 1.3254214426286584, + "learning_rate": 9.60335761253528e-08, + "loss": 0.5436, + "step": 12338 + }, + { + "epoch": 0.96, + "grad_norm": 1.1557377376413358, + "learning_rate": 9.568649901735672e-08, + "loss": 0.491, + "step": 12339 + }, + { + "epoch": 0.96, + "grad_norm": 1.4133291292277972, + "learning_rate": 9.534004721976964e-08, + "loss": 0.5685, + "step": 12340 + }, + { + "epoch": 0.96, + "grad_norm": 1.3218123815688771, + "learning_rate": 9.49942207544674e-08, + "loss": 0.5135, + "step": 12341 + }, + { + "epoch": 0.96, + "grad_norm": 1.0873486797244585, + "learning_rate": 9.464901964328365e-08, + "loss": 0.4766, + "step": 12342 + }, + { + "epoch": 0.96, + "grad_norm": 1.391677029888986, + "learning_rate": 9.430444390801208e-08, + "loss": 0.5173, + "step": 12343 + }, + { + "epoch": 0.96, + "grad_norm": 1.2115421385354124, + "learning_rate": 9.39604935704086e-08, + "loss": 0.4663, + "step": 12344 + }, + { + "epoch": 0.96, + "grad_norm": 1.1313327463984348, + "learning_rate": 9.361716865218584e-08, + "loss": 0.4564, + "step": 12345 + }, + { + "epoch": 0.96, + "grad_norm": 1.2953986071414358, + "learning_rate": 9.327446917502203e-08, + "loss": 0.5349, + "step": 12346 + }, + { + "epoch": 0.96, + "grad_norm": 1.2836237991539876, + "learning_rate": 9.29323951605532e-08, + "loss": 0.4866, + "step": 12347 + }, + { + "epoch": 0.96, + "grad_norm": 1.236916786093268, + "learning_rate": 9.259094663037649e-08, + "loss": 0.4894, + "step": 12348 + }, + { + "epoch": 0.96, + "grad_norm": 1.1705139903454993, + "learning_rate": 9.225012360604802e-08, + "loss": 0.4518, + "step": 12349 + }, + { + "epoch": 0.96, + "grad_norm": 1.1464493338381698, + "learning_rate": 9.190992610908611e-08, + "loss": 0.4778, + "step": 12350 + }, + { + "epoch": 0.96, + "grad_norm": 1.187191370962509, + "learning_rate": 9.157035416097027e-08, + "loss": 0.4557, + "step": 12351 + }, + { + "epoch": 0.96, + "grad_norm": 1.291838183346705, + "learning_rate": 9.123140778313777e-08, + "loss": 0.5257, + "step": 12352 + }, + { + "epoch": 0.96, + "grad_norm": 1.244238354064877, + "learning_rate": 9.08930869969904e-08, + "loss": 0.4983, + "step": 12353 + }, + { + "epoch": 0.96, + "grad_norm": 1.2552400904182437, + "learning_rate": 9.055539182388662e-08, + "loss": 0.5225, + "step": 12354 + }, + { + "epoch": 0.96, + "grad_norm": 1.2814056953175004, + "learning_rate": 9.021832228514715e-08, + "loss": 0.483, + "step": 12355 + }, + { + "epoch": 0.96, + "grad_norm": 1.2992610812100918, + "learning_rate": 8.988187840205164e-08, + "loss": 0.5104, + "step": 12356 + }, + { + "epoch": 0.96, + "grad_norm": 1.1911457715064353, + "learning_rate": 8.954606019584312e-08, + "loss": 0.5231, + "step": 12357 + }, + { + "epoch": 0.96, + "grad_norm": 1.1331069563593128, + "learning_rate": 8.921086768772346e-08, + "loss": 0.4375, + "step": 12358 + }, + { + "epoch": 0.96, + "grad_norm": 1.2591847107959995, + "learning_rate": 8.887630089885357e-08, + "loss": 0.5229, + "step": 12359 + }, + { + "epoch": 0.96, + "grad_norm": 1.2132962787824835, + "learning_rate": 8.854235985035875e-08, + "loss": 0.4656, + "step": 12360 + }, + { + "epoch": 0.96, + "grad_norm": 1.2163328017525223, + "learning_rate": 8.820904456331992e-08, + "loss": 0.5026, + "step": 12361 + }, + { + "epoch": 0.96, + "grad_norm": 1.2499289492441437, + "learning_rate": 8.787635505878245e-08, + "loss": 0.536, + "step": 12362 + }, + { + "epoch": 0.96, + "grad_norm": 1.1698546223105282, + "learning_rate": 8.754429135775178e-08, + "loss": 0.5059, + "step": 12363 + }, + { + "epoch": 0.96, + "grad_norm": 1.131999862162039, + "learning_rate": 8.721285348119113e-08, + "loss": 0.4584, + "step": 12364 + }, + { + "epoch": 0.96, + "grad_norm": 1.3023521857855165, + "learning_rate": 8.688204145002598e-08, + "loss": 0.5216, + "step": 12365 + }, + { + "epoch": 0.96, + "grad_norm": 1.2223730277215143, + "learning_rate": 8.655185528514187e-08, + "loss": 0.5182, + "step": 12366 + }, + { + "epoch": 0.96, + "grad_norm": 1.218679181511168, + "learning_rate": 8.622229500738655e-08, + "loss": 0.4857, + "step": 12367 + }, + { + "epoch": 0.96, + "grad_norm": 1.2070293241318828, + "learning_rate": 8.58933606375667e-08, + "loss": 0.4279, + "step": 12368 + }, + { + "epoch": 0.96, + "grad_norm": 1.3088704286418436, + "learning_rate": 8.556505219644795e-08, + "loss": 0.5166, + "step": 12369 + }, + { + "epoch": 0.96, + "grad_norm": 1.3150833682187608, + "learning_rate": 8.523736970475927e-08, + "loss": 0.4945, + "step": 12370 + }, + { + "epoch": 0.96, + "grad_norm": 1.0519207869756797, + "learning_rate": 8.491031318318854e-08, + "loss": 0.4668, + "step": 12371 + }, + { + "epoch": 0.96, + "grad_norm": 1.089009775995349, + "learning_rate": 8.458388265238593e-08, + "loss": 0.4625, + "step": 12372 + }, + { + "epoch": 0.96, + "grad_norm": 1.2290739843480374, + "learning_rate": 8.425807813295939e-08, + "loss": 0.5232, + "step": 12373 + }, + { + "epoch": 0.96, + "grad_norm": 1.071744086720638, + "learning_rate": 8.393289964547912e-08, + "loss": 0.4929, + "step": 12374 + }, + { + "epoch": 0.96, + "grad_norm": 1.216458685399548, + "learning_rate": 8.360834721047429e-08, + "loss": 0.4963, + "step": 12375 + }, + { + "epoch": 0.96, + "grad_norm": 1.1171888338094365, + "learning_rate": 8.328442084843624e-08, + "loss": 0.467, + "step": 12376 + }, + { + "epoch": 0.96, + "grad_norm": 1.19275806217831, + "learning_rate": 8.296112057981643e-08, + "loss": 0.5244, + "step": 12377 + }, + { + "epoch": 0.96, + "grad_norm": 1.1741212805663994, + "learning_rate": 8.263844642502628e-08, + "loss": 0.463, + "step": 12378 + }, + { + "epoch": 0.96, + "grad_norm": 1.2538473525430873, + "learning_rate": 8.23163984044395e-08, + "loss": 0.4948, + "step": 12379 + }, + { + "epoch": 0.96, + "grad_norm": 1.2948280634205498, + "learning_rate": 8.19949765383854e-08, + "loss": 0.5345, + "step": 12380 + }, + { + "epoch": 0.96, + "grad_norm": 1.2498698166767315, + "learning_rate": 8.167418084715772e-08, + "loss": 0.5011, + "step": 12381 + }, + { + "epoch": 0.96, + "grad_norm": 1.252073189963555, + "learning_rate": 8.135401135101251e-08, + "loss": 0.4759, + "step": 12382 + }, + { + "epoch": 0.96, + "grad_norm": 1.2972628002141737, + "learning_rate": 8.103446807016135e-08, + "loss": 0.5339, + "step": 12383 + }, + { + "epoch": 0.96, + "grad_norm": 1.1711803666364815, + "learning_rate": 8.071555102478035e-08, + "loss": 0.4632, + "step": 12384 + }, + { + "epoch": 0.96, + "grad_norm": 1.1988101225763215, + "learning_rate": 8.039726023500227e-08, + "loss": 0.5281, + "step": 12385 + }, + { + "epoch": 0.96, + "grad_norm": 1.18085457374639, + "learning_rate": 8.007959572092328e-08, + "loss": 0.5131, + "step": 12386 + }, + { + "epoch": 0.96, + "grad_norm": 1.1791846612067254, + "learning_rate": 7.976255750260065e-08, + "loss": 0.4708, + "step": 12387 + }, + { + "epoch": 0.96, + "grad_norm": 1.23472831594887, + "learning_rate": 7.944614560004838e-08, + "loss": 0.5509, + "step": 12388 + }, + { + "epoch": 0.96, + "grad_norm": 1.1848678529670518, + "learning_rate": 7.913036003324492e-08, + "loss": 0.4916, + "step": 12389 + }, + { + "epoch": 0.96, + "grad_norm": 1.253816357774344, + "learning_rate": 7.881520082212657e-08, + "loss": 0.5403, + "step": 12390 + }, + { + "epoch": 0.96, + "grad_norm": 1.2233893783284107, + "learning_rate": 7.850066798659072e-08, + "loss": 0.5231, + "step": 12391 + }, + { + "epoch": 0.96, + "grad_norm": 1.231055516408078, + "learning_rate": 7.818676154649707e-08, + "loss": 0.504, + "step": 12392 + }, + { + "epoch": 0.96, + "grad_norm": 1.3453729277008863, + "learning_rate": 7.787348152166197e-08, + "loss": 0.5042, + "step": 12393 + }, + { + "epoch": 0.96, + "grad_norm": 1.194942591656876, + "learning_rate": 7.756082793186626e-08, + "loss": 0.4935, + "step": 12394 + }, + { + "epoch": 0.96, + "grad_norm": 1.3137209526647342, + "learning_rate": 7.724880079684748e-08, + "loss": 0.5091, + "step": 12395 + }, + { + "epoch": 0.96, + "grad_norm": 1.159682514300226, + "learning_rate": 7.693740013630768e-08, + "loss": 0.5093, + "step": 12396 + }, + { + "epoch": 0.96, + "grad_norm": 1.1842477332510826, + "learning_rate": 7.662662596990555e-08, + "loss": 0.4457, + "step": 12397 + }, + { + "epoch": 0.96, + "grad_norm": 1.2793293024277577, + "learning_rate": 7.631647831726207e-08, + "loss": 0.5212, + "step": 12398 + }, + { + "epoch": 0.96, + "grad_norm": 1.0988704170146235, + "learning_rate": 7.600695719795936e-08, + "loss": 0.4316, + "step": 12399 + }, + { + "epoch": 0.96, + "grad_norm": 1.1589832476901958, + "learning_rate": 7.569806263153734e-08, + "loss": 0.4934, + "step": 12400 + }, + { + "epoch": 0.96, + "grad_norm": 1.1761044283063211, + "learning_rate": 7.53897946375004e-08, + "loss": 0.5132, + "step": 12401 + }, + { + "epoch": 0.96, + "grad_norm": 1.2647926984713764, + "learning_rate": 7.508215323531076e-08, + "loss": 0.4584, + "step": 12402 + }, + { + "epoch": 0.96, + "grad_norm": 1.2613883043418646, + "learning_rate": 7.477513844438955e-08, + "loss": 0.5084, + "step": 12403 + }, + { + "epoch": 0.96, + "grad_norm": 1.09665561014378, + "learning_rate": 7.446875028412126e-08, + "loss": 0.453, + "step": 12404 + }, + { + "epoch": 0.96, + "grad_norm": 1.114006047470416, + "learning_rate": 7.416298877384931e-08, + "loss": 0.4832, + "step": 12405 + }, + { + "epoch": 0.96, + "grad_norm": 1.170540673049112, + "learning_rate": 7.385785393287936e-08, + "loss": 0.4795, + "step": 12406 + }, + { + "epoch": 0.96, + "grad_norm": 1.2339810092707444, + "learning_rate": 7.35533457804749e-08, + "loss": 0.5068, + "step": 12407 + }, + { + "epoch": 0.96, + "grad_norm": 1.1914994500399467, + "learning_rate": 7.324946433586055e-08, + "loss": 0.4664, + "step": 12408 + }, + { + "epoch": 0.96, + "grad_norm": 1.2010054806465922, + "learning_rate": 7.29462096182243e-08, + "loss": 0.5039, + "step": 12409 + }, + { + "epoch": 0.96, + "grad_norm": 1.2048789281685088, + "learning_rate": 7.264358164671082e-08, + "loss": 0.5071, + "step": 12410 + }, + { + "epoch": 0.96, + "grad_norm": 1.2081824679848838, + "learning_rate": 7.234158044042482e-08, + "loss": 0.4874, + "step": 12411 + }, + { + "epoch": 0.96, + "grad_norm": 1.1325617019316214, + "learning_rate": 7.204020601843665e-08, + "loss": 0.4623, + "step": 12412 + }, + { + "epoch": 0.96, + "grad_norm": 1.2255159517648435, + "learning_rate": 7.173945839977103e-08, + "loss": 0.4807, + "step": 12413 + }, + { + "epoch": 0.96, + "grad_norm": 1.2583110129458865, + "learning_rate": 7.143933760341615e-08, + "loss": 0.5206, + "step": 12414 + }, + { + "epoch": 0.96, + "grad_norm": 1.3173090162597205, + "learning_rate": 7.113984364832127e-08, + "loss": 0.5683, + "step": 12415 + }, + { + "epoch": 0.96, + "grad_norm": 1.2134573529250456, + "learning_rate": 7.08409765533935e-08, + "loss": 0.4864, + "step": 12416 + }, + { + "epoch": 0.96, + "grad_norm": 1.2282197289660592, + "learning_rate": 7.054273633750219e-08, + "loss": 0.5317, + "step": 12417 + }, + { + "epoch": 0.96, + "grad_norm": 1.2343168124234116, + "learning_rate": 7.024512301947783e-08, + "loss": 0.5057, + "step": 12418 + }, + { + "epoch": 0.96, + "grad_norm": 1.2378050071838158, + "learning_rate": 6.994813661810984e-08, + "loss": 0.4947, + "step": 12419 + }, + { + "epoch": 0.96, + "grad_norm": 1.1777560863106833, + "learning_rate": 6.965177715214878e-08, + "loss": 0.4611, + "step": 12420 + }, + { + "epoch": 0.96, + "grad_norm": 1.2014387625091465, + "learning_rate": 6.93560446403041e-08, + "loss": 0.4762, + "step": 12421 + }, + { + "epoch": 0.96, + "grad_norm": 1.1815626386648932, + "learning_rate": 6.906093910124756e-08, + "loss": 0.4744, + "step": 12422 + }, + { + "epoch": 0.96, + "grad_norm": 1.2697310539885565, + "learning_rate": 6.876646055361091e-08, + "loss": 0.5249, + "step": 12423 + }, + { + "epoch": 0.96, + "grad_norm": 1.3944536289015632, + "learning_rate": 6.847260901598595e-08, + "loss": 0.634, + "step": 12424 + }, + { + "epoch": 0.96, + "grad_norm": 1.2221366094939305, + "learning_rate": 6.817938450692674e-08, + "loss": 0.5356, + "step": 12425 + }, + { + "epoch": 0.96, + "grad_norm": 1.241579302758814, + "learning_rate": 6.788678704494289e-08, + "loss": 0.4945, + "step": 12426 + }, + { + "epoch": 0.96, + "grad_norm": 1.286607520015567, + "learning_rate": 6.759481664850853e-08, + "loss": 0.5496, + "step": 12427 + }, + { + "epoch": 0.96, + "grad_norm": 1.2600355228835922, + "learning_rate": 6.73034733360589e-08, + "loss": 0.4979, + "step": 12428 + }, + { + "epoch": 0.96, + "grad_norm": 1.2865756929845544, + "learning_rate": 6.70127571259871e-08, + "loss": 0.5709, + "step": 12429 + }, + { + "epoch": 0.96, + "grad_norm": 1.259320273523649, + "learning_rate": 6.672266803664729e-08, + "loss": 0.4574, + "step": 12430 + }, + { + "epoch": 0.96, + "grad_norm": 1.1934977699358034, + "learning_rate": 6.643320608635373e-08, + "loss": 0.5093, + "step": 12431 + }, + { + "epoch": 0.96, + "grad_norm": 1.2332053135250418, + "learning_rate": 6.614437129338402e-08, + "loss": 0.526, + "step": 12432 + }, + { + "epoch": 0.96, + "grad_norm": 1.2581479590756683, + "learning_rate": 6.585616367597025e-08, + "loss": 0.5149, + "step": 12433 + }, + { + "epoch": 0.96, + "grad_norm": 1.1675335127372417, + "learning_rate": 6.556858325231119e-08, + "loss": 0.4959, + "step": 12434 + }, + { + "epoch": 0.96, + "grad_norm": 1.2431266161724495, + "learning_rate": 6.528163004056231e-08, + "loss": 0.5139, + "step": 12435 + }, + { + "epoch": 0.96, + "grad_norm": 1.265639410996514, + "learning_rate": 6.499530405884025e-08, + "loss": 0.4859, + "step": 12436 + }, + { + "epoch": 0.96, + "grad_norm": 1.376424788169291, + "learning_rate": 6.470960532522275e-08, + "loss": 0.474, + "step": 12437 + }, + { + "epoch": 0.96, + "grad_norm": 1.2535500659294578, + "learning_rate": 6.44245338577465e-08, + "loss": 0.5081, + "step": 12438 + }, + { + "epoch": 0.96, + "grad_norm": 1.3082010619408375, + "learning_rate": 6.414008967441155e-08, + "loss": 0.525, + "step": 12439 + }, + { + "epoch": 0.97, + "grad_norm": 1.204484097617681, + "learning_rate": 6.385627279317463e-08, + "loss": 0.4792, + "step": 12440 + }, + { + "epoch": 0.97, + "grad_norm": 1.107075685889368, + "learning_rate": 6.357308323195476e-08, + "loss": 0.4474, + "step": 12441 + }, + { + "epoch": 0.97, + "grad_norm": 1.349187716075802, + "learning_rate": 6.329052100863209e-08, + "loss": 0.5518, + "step": 12442 + }, + { + "epoch": 0.97, + "grad_norm": 1.198955047143343, + "learning_rate": 6.300858614104455e-08, + "loss": 0.466, + "step": 12443 + }, + { + "epoch": 0.97, + "grad_norm": 1.3348395760546958, + "learning_rate": 6.272727864699234e-08, + "loss": 0.5237, + "step": 12444 + }, + { + "epoch": 0.97, + "grad_norm": 1.244967103776308, + "learning_rate": 6.244659854423795e-08, + "loss": 0.4893, + "step": 12445 + }, + { + "epoch": 0.97, + "grad_norm": 1.1692606437709234, + "learning_rate": 6.216654585050052e-08, + "loss": 0.4229, + "step": 12446 + }, + { + "epoch": 0.97, + "grad_norm": 1.1444910973285545, + "learning_rate": 6.188712058346147e-08, + "loss": 0.4876, + "step": 12447 + }, + { + "epoch": 0.97, + "grad_norm": 1.179135780918476, + "learning_rate": 6.160832276076223e-08, + "loss": 0.5015, + "step": 12448 + }, + { + "epoch": 0.97, + "grad_norm": 1.1920127950628068, + "learning_rate": 6.13301524000054e-08, + "loss": 0.4765, + "step": 12449 + }, + { + "epoch": 0.97, + "grad_norm": 1.3099366542555526, + "learning_rate": 6.105260951875247e-08, + "loss": 0.5431, + "step": 12450 + }, + { + "epoch": 0.97, + "grad_norm": 1.2526664903433207, + "learning_rate": 6.077569413452722e-08, + "loss": 0.4864, + "step": 12451 + }, + { + "epoch": 0.97, + "grad_norm": 1.126515744914939, + "learning_rate": 6.049940626481121e-08, + "loss": 0.4542, + "step": 12452 + }, + { + "epoch": 0.97, + "grad_norm": 1.1824531724944594, + "learning_rate": 6.022374592704938e-08, + "loss": 0.5002, + "step": 12453 + }, + { + "epoch": 0.97, + "grad_norm": 1.2310202195360875, + "learning_rate": 5.994871313864448e-08, + "loss": 0.547, + "step": 12454 + }, + { + "epoch": 0.97, + "grad_norm": 1.2322254054532231, + "learning_rate": 5.96743079169615e-08, + "loss": 0.5195, + "step": 12455 + }, + { + "epoch": 0.97, + "grad_norm": 1.3136923233423534, + "learning_rate": 5.9400530279324354e-08, + "loss": 0.4679, + "step": 12456 + }, + { + "epoch": 0.97, + "grad_norm": 1.2298715262268494, + "learning_rate": 5.9127380243019225e-08, + "loss": 0.5101, + "step": 12457 + }, + { + "epoch": 0.97, + "grad_norm": 1.1989055808642282, + "learning_rate": 5.88548578252901e-08, + "loss": 0.5015, + "step": 12458 + }, + { + "epoch": 0.97, + "grad_norm": 1.2786382246341397, + "learning_rate": 5.858296304334321e-08, + "loss": 0.474, + "step": 12459 + }, + { + "epoch": 0.97, + "grad_norm": 1.2971925921100096, + "learning_rate": 5.831169591434593e-08, + "loss": 0.5081, + "step": 12460 + }, + { + "epoch": 0.97, + "grad_norm": 1.189433932345285, + "learning_rate": 5.804105645542235e-08, + "loss": 0.4768, + "step": 12461 + }, + { + "epoch": 0.97, + "grad_norm": 1.2423990897143997, + "learning_rate": 5.7771044683662125e-08, + "loss": 0.5466, + "step": 12462 + }, + { + "epoch": 0.97, + "grad_norm": 1.3103228631986958, + "learning_rate": 5.750166061611051e-08, + "loss": 0.5119, + "step": 12463 + }, + { + "epoch": 0.97, + "grad_norm": 1.2751552954699292, + "learning_rate": 5.7232904269775014e-08, + "loss": 0.5203, + "step": 12464 + }, + { + "epoch": 0.97, + "grad_norm": 1.1144996801814857, + "learning_rate": 5.696477566162428e-08, + "loss": 0.481, + "step": 12465 + }, + { + "epoch": 0.97, + "grad_norm": 1.2273057762987907, + "learning_rate": 5.6697274808587e-08, + "loss": 0.5185, + "step": 12466 + }, + { + "epoch": 0.97, + "grad_norm": 1.158732764353542, + "learning_rate": 5.6430401727550766e-08, + "loss": 0.4804, + "step": 12467 + }, + { + "epoch": 0.97, + "grad_norm": 1.2442779704746576, + "learning_rate": 5.6164156435365435e-08, + "loss": 0.5379, + "step": 12468 + }, + { + "epoch": 0.97, + "grad_norm": 1.2501807082207745, + "learning_rate": 5.5898538948840896e-08, + "loss": 0.4778, + "step": 12469 + }, + { + "epoch": 0.97, + "grad_norm": 1.1344399698951528, + "learning_rate": 5.563354928474596e-08, + "loss": 0.4688, + "step": 12470 + }, + { + "epoch": 0.97, + "grad_norm": 1.278626570650382, + "learning_rate": 5.536918745981168e-08, + "loss": 0.5, + "step": 12471 + }, + { + "epoch": 0.97, + "grad_norm": 1.2756372654356487, + "learning_rate": 5.5105453490728047e-08, + "loss": 0.5128, + "step": 12472 + }, + { + "epoch": 0.97, + "grad_norm": 1.2517831481664743, + "learning_rate": 5.484234739414618e-08, + "loss": 0.5001, + "step": 12473 + }, + { + "epoch": 0.97, + "grad_norm": 1.1978757215495914, + "learning_rate": 5.4579869186676126e-08, + "loss": 0.4772, + "step": 12474 + }, + { + "epoch": 0.97, + "grad_norm": 1.1302540267403467, + "learning_rate": 5.431801888489241e-08, + "loss": 0.515, + "step": 12475 + }, + { + "epoch": 0.97, + "grad_norm": 1.2754454432999216, + "learning_rate": 5.405679650532403e-08, + "loss": 0.5182, + "step": 12476 + }, + { + "epoch": 0.97, + "grad_norm": 1.132319218014642, + "learning_rate": 5.3796202064464454e-08, + "loss": 0.4324, + "step": 12477 + }, + { + "epoch": 0.97, + "grad_norm": 1.265646004208183, + "learning_rate": 5.353623557876608e-08, + "loss": 0.4767, + "step": 12478 + }, + { + "epoch": 0.97, + "grad_norm": 1.2585955723394726, + "learning_rate": 5.3276897064641344e-08, + "loss": 0.5004, + "step": 12479 + }, + { + "epoch": 0.97, + "grad_norm": 1.165475384721079, + "learning_rate": 5.301818653846602e-08, + "loss": 0.4736, + "step": 12480 + }, + { + "epoch": 0.97, + "grad_norm": 1.2184190545321012, + "learning_rate": 5.27601040165715e-08, + "loss": 0.4696, + "step": 12481 + }, + { + "epoch": 0.97, + "grad_norm": 1.1733015913805975, + "learning_rate": 5.250264951525364e-08, + "loss": 0.5176, + "step": 12482 + }, + { + "epoch": 0.97, + "grad_norm": 1.1832925142492279, + "learning_rate": 5.224582305076498e-08, + "loss": 0.5312, + "step": 12483 + }, + { + "epoch": 0.97, + "grad_norm": 1.1988818661554093, + "learning_rate": 5.198962463932145e-08, + "loss": 0.4946, + "step": 12484 + }, + { + "epoch": 0.97, + "grad_norm": 1.2209559796714005, + "learning_rate": 5.173405429709677e-08, + "loss": 0.5064, + "step": 12485 + }, + { + "epoch": 0.97, + "grad_norm": 1.3001314206818908, + "learning_rate": 5.147911204022915e-08, + "loss": 0.5079, + "step": 12486 + }, + { + "epoch": 0.97, + "grad_norm": 1.2037131618198813, + "learning_rate": 5.1224797884812385e-08, + "loss": 0.4723, + "step": 12487 + }, + { + "epoch": 0.97, + "grad_norm": 1.1876966915865976, + "learning_rate": 5.097111184690251e-08, + "loss": 0.5197, + "step": 12488 + }, + { + "epoch": 0.97, + "grad_norm": 1.1044524680634409, + "learning_rate": 5.071805394251672e-08, + "loss": 0.4765, + "step": 12489 + }, + { + "epoch": 0.97, + "grad_norm": 1.1593487430694185, + "learning_rate": 5.046562418763223e-08, + "loss": 0.5001, + "step": 12490 + }, + { + "epoch": 0.97, + "grad_norm": 1.3758537936215205, + "learning_rate": 5.0213822598185194e-08, + "loss": 0.5038, + "step": 12491 + }, + { + "epoch": 0.97, + "grad_norm": 1.1214693616541775, + "learning_rate": 4.996264919007399e-08, + "loss": 0.4701, + "step": 12492 + }, + { + "epoch": 0.97, + "grad_norm": 1.3261214458782196, + "learning_rate": 4.971210397915594e-08, + "loss": 0.5007, + "step": 12493 + }, + { + "epoch": 0.97, + "grad_norm": 1.1851359476904144, + "learning_rate": 4.9462186981249496e-08, + "loss": 0.4835, + "step": 12494 + }, + { + "epoch": 0.97, + "grad_norm": 1.1880002223056367, + "learning_rate": 4.9212898212133154e-08, + "loss": 0.4855, + "step": 12495 + }, + { + "epoch": 0.97, + "grad_norm": 1.274575222538688, + "learning_rate": 4.8964237687546543e-08, + "loss": 0.5616, + "step": 12496 + }, + { + "epoch": 0.97, + "grad_norm": 1.316144243648743, + "learning_rate": 4.871620542318711e-08, + "loss": 0.5992, + "step": 12497 + }, + { + "epoch": 0.97, + "grad_norm": 1.2236908748943929, + "learning_rate": 4.846880143471677e-08, + "loss": 0.5141, + "step": 12498 + }, + { + "epoch": 0.97, + "grad_norm": 1.2364336062205494, + "learning_rate": 4.822202573775303e-08, + "loss": 0.5309, + "step": 12499 + }, + { + "epoch": 0.97, + "grad_norm": 1.2401378203526143, + "learning_rate": 4.797587834787787e-08, + "loss": 0.5368, + "step": 12500 + }, + { + "epoch": 0.97, + "grad_norm": 1.1809495654856321, + "learning_rate": 4.773035928063108e-08, + "loss": 0.4937, + "step": 12501 + }, + { + "epoch": 0.97, + "grad_norm": 1.1918283684044366, + "learning_rate": 4.748546855151359e-08, + "loss": 0.4847, + "step": 12502 + }, + { + "epoch": 0.97, + "grad_norm": 1.198322420923864, + "learning_rate": 4.724120617598637e-08, + "loss": 0.4441, + "step": 12503 + }, + { + "epoch": 0.97, + "grad_norm": 1.226661337982573, + "learning_rate": 4.699757216947154e-08, + "loss": 0.4981, + "step": 12504 + }, + { + "epoch": 0.97, + "grad_norm": 1.2429011951744944, + "learning_rate": 4.675456654735122e-08, + "loss": 0.4803, + "step": 12505 + }, + { + "epoch": 0.97, + "grad_norm": 1.1273467700307132, + "learning_rate": 4.6512189324966484e-08, + "loss": 0.4133, + "step": 12506 + }, + { + "epoch": 0.97, + "grad_norm": 1.3403173560501256, + "learning_rate": 4.627044051762064e-08, + "loss": 0.5396, + "step": 12507 + }, + { + "epoch": 0.97, + "grad_norm": 1.3218808311532189, + "learning_rate": 4.602932014057704e-08, + "loss": 0.5339, + "step": 12508 + }, + { + "epoch": 0.97, + "grad_norm": 1.1948304045610205, + "learning_rate": 4.5788828209056836e-08, + "loss": 0.4882, + "step": 12509 + }, + { + "epoch": 0.97, + "grad_norm": 1.3097140854121707, + "learning_rate": 4.5548964738246774e-08, + "loss": 0.5099, + "step": 12510 + }, + { + "epoch": 0.97, + "grad_norm": 1.2479411335596724, + "learning_rate": 4.530972974328696e-08, + "loss": 0.5339, + "step": 12511 + }, + { + "epoch": 0.97, + "grad_norm": 1.2560117163445763, + "learning_rate": 4.50711232392842e-08, + "loss": 0.5056, + "step": 12512 + }, + { + "epoch": 0.97, + "grad_norm": 1.3385808570359352, + "learning_rate": 4.4833145241302e-08, + "loss": 0.5483, + "step": 12513 + }, + { + "epoch": 0.97, + "grad_norm": 1.1379624736612282, + "learning_rate": 4.4595795764365015e-08, + "loss": 0.4174, + "step": 12514 + }, + { + "epoch": 0.97, + "grad_norm": 1.173764244537531, + "learning_rate": 4.4359074823459025e-08, + "loss": 0.4725, + "step": 12515 + }, + { + "epoch": 0.97, + "grad_norm": 1.1221474404383212, + "learning_rate": 4.412298243352875e-08, + "loss": 0.4476, + "step": 12516 + }, + { + "epoch": 0.97, + "grad_norm": 1.2210884157578026, + "learning_rate": 4.388751860948004e-08, + "loss": 0.5291, + "step": 12517 + }, + { + "epoch": 0.97, + "grad_norm": 1.2707082134418177, + "learning_rate": 4.3652683366178784e-08, + "loss": 0.4512, + "step": 12518 + }, + { + "epoch": 0.97, + "grad_norm": 1.3398238200048331, + "learning_rate": 4.341847671845201e-08, + "loss": 0.552, + "step": 12519 + }, + { + "epoch": 0.97, + "grad_norm": 1.3016117869704498, + "learning_rate": 4.318489868108677e-08, + "loss": 0.5273, + "step": 12520 + }, + { + "epoch": 0.97, + "grad_norm": 1.2658775925239178, + "learning_rate": 4.2951949268827955e-08, + "loss": 0.5208, + "step": 12521 + }, + { + "epoch": 0.97, + "grad_norm": 1.1328662728010623, + "learning_rate": 4.2719628496384894e-08, + "loss": 0.4769, + "step": 12522 + }, + { + "epoch": 0.97, + "grad_norm": 1.2399536295649827, + "learning_rate": 4.248793637842474e-08, + "loss": 0.4867, + "step": 12523 + }, + { + "epoch": 0.97, + "grad_norm": 1.2494636815606133, + "learning_rate": 4.22568729295747e-08, + "loss": 0.5725, + "step": 12524 + }, + { + "epoch": 0.97, + "grad_norm": 1.1977545034140378, + "learning_rate": 4.202643816442309e-08, + "loss": 0.5215, + "step": 12525 + }, + { + "epoch": 0.97, + "grad_norm": 1.2078838115135055, + "learning_rate": 4.179663209751939e-08, + "loss": 0.5288, + "step": 12526 + }, + { + "epoch": 0.97, + "grad_norm": 1.2131186745433011, + "learning_rate": 4.156745474337198e-08, + "loss": 0.4746, + "step": 12527 + }, + { + "epoch": 0.97, + "grad_norm": 1.2790758713713688, + "learning_rate": 4.133890611644931e-08, + "loss": 0.5215, + "step": 12528 + }, + { + "epoch": 0.97, + "grad_norm": 1.2232141653862378, + "learning_rate": 4.111098623118204e-08, + "loss": 0.4987, + "step": 12529 + }, + { + "epoch": 0.97, + "grad_norm": 1.2894984577372375, + "learning_rate": 4.0883695101959774e-08, + "loss": 0.5045, + "step": 12530 + }, + { + "epoch": 0.97, + "grad_norm": 1.2462082095777405, + "learning_rate": 4.0657032743131044e-08, + "loss": 0.5209, + "step": 12531 + }, + { + "epoch": 0.97, + "grad_norm": 1.2267800399401552, + "learning_rate": 4.0430999169007726e-08, + "loss": 0.5071, + "step": 12532 + }, + { + "epoch": 0.97, + "grad_norm": 1.2586559050619879, + "learning_rate": 4.0205594393859513e-08, + "loss": 0.5277, + "step": 12533 + }, + { + "epoch": 0.97, + "grad_norm": 1.1806697671782416, + "learning_rate": 3.9980818431918366e-08, + "loss": 0.459, + "step": 12534 + }, + { + "epoch": 0.97, + "grad_norm": 1.2617609353586483, + "learning_rate": 3.975667129737515e-08, + "loss": 0.5088, + "step": 12535 + }, + { + "epoch": 0.97, + "grad_norm": 1.0412725020744353, + "learning_rate": 3.9533153004381873e-08, + "loss": 0.4463, + "step": 12536 + }, + { + "epoch": 0.97, + "grad_norm": 1.2060807816003063, + "learning_rate": 3.9310263567049476e-08, + "loss": 0.5401, + "step": 12537 + }, + { + "epoch": 0.97, + "grad_norm": 1.1969549824655619, + "learning_rate": 3.9088002999450034e-08, + "loss": 0.5117, + "step": 12538 + }, + { + "epoch": 0.97, + "grad_norm": 1.1970245965041508, + "learning_rate": 3.8866371315616766e-08, + "loss": 0.4764, + "step": 12539 + }, + { + "epoch": 0.97, + "grad_norm": 1.2294792919982487, + "learning_rate": 3.864536852954293e-08, + "loss": 0.5033, + "step": 12540 + }, + { + "epoch": 0.97, + "grad_norm": 1.1885830307275689, + "learning_rate": 3.842499465518068e-08, + "loss": 0.5274, + "step": 12541 + }, + { + "epoch": 0.97, + "grad_norm": 1.1805036134450464, + "learning_rate": 3.8205249706443345e-08, + "loss": 0.4782, + "step": 12542 + }, + { + "epoch": 0.97, + "grad_norm": 1.3933795693284015, + "learning_rate": 3.798613369720427e-08, + "loss": 0.5868, + "step": 12543 + }, + { + "epoch": 0.97, + "grad_norm": 1.248727436797367, + "learning_rate": 3.776764664129684e-08, + "loss": 0.5094, + "step": 12544 + }, + { + "epoch": 0.97, + "grad_norm": 1.1594935621318927, + "learning_rate": 3.7549788552517786e-08, + "loss": 0.4392, + "step": 12545 + }, + { + "epoch": 0.97, + "grad_norm": 1.411331532355703, + "learning_rate": 3.733255944461944e-08, + "loss": 0.5604, + "step": 12546 + }, + { + "epoch": 0.97, + "grad_norm": 1.1500280874388213, + "learning_rate": 3.711595933131751e-08, + "loss": 0.5048, + "step": 12547 + }, + { + "epoch": 0.97, + "grad_norm": 1.225116241545206, + "learning_rate": 3.68999882262866e-08, + "loss": 0.4946, + "step": 12548 + }, + { + "epoch": 0.97, + "grad_norm": 1.2390220183293383, + "learning_rate": 3.668464614316247e-08, + "loss": 0.5018, + "step": 12549 + }, + { + "epoch": 0.97, + "grad_norm": 1.3830168406053522, + "learning_rate": 3.646993309554092e-08, + "loss": 0.508, + "step": 12550 + }, + { + "epoch": 0.97, + "grad_norm": 1.15229367939418, + "learning_rate": 3.6255849096976655e-08, + "loss": 0.4659, + "step": 12551 + }, + { + "epoch": 0.97, + "grad_norm": 1.2691241271783298, + "learning_rate": 3.6042394160987756e-08, + "loss": 0.4978, + "step": 12552 + }, + { + "epoch": 0.97, + "grad_norm": 1.2810706966822991, + "learning_rate": 3.5829568301049e-08, + "loss": 0.473, + "step": 12553 + }, + { + "epoch": 0.97, + "grad_norm": 1.252853046312049, + "learning_rate": 3.561737153059741e-08, + "loss": 0.5371, + "step": 12554 + }, + { + "epoch": 0.97, + "grad_norm": 1.3147221781602605, + "learning_rate": 3.5405803863032274e-08, + "loss": 0.4885, + "step": 12555 + }, + { + "epoch": 0.97, + "grad_norm": 1.1985113685651188, + "learning_rate": 3.519486531170735e-08, + "loss": 0.5107, + "step": 12556 + }, + { + "epoch": 0.97, + "grad_norm": 1.2007917097203231, + "learning_rate": 3.4984555889944204e-08, + "loss": 0.4907, + "step": 12557 + }, + { + "epoch": 0.97, + "grad_norm": 1.3001402687545658, + "learning_rate": 3.477487561101778e-08, + "loss": 0.505, + "step": 12558 + }, + { + "epoch": 0.97, + "grad_norm": 1.2350823994997568, + "learning_rate": 3.4565824488166366e-08, + "loss": 0.5286, + "step": 12559 + }, + { + "epoch": 0.97, + "grad_norm": 1.16510285860732, + "learning_rate": 3.435740253459052e-08, + "loss": 0.5068, + "step": 12560 + }, + { + "epoch": 0.97, + "grad_norm": 1.14552285004849, + "learning_rate": 3.41496097634475e-08, + "loss": 0.4835, + "step": 12561 + }, + { + "epoch": 0.97, + "grad_norm": 1.308576486957611, + "learning_rate": 3.3942446187857915e-08, + "loss": 0.6124, + "step": 12562 + }, + { + "epoch": 0.97, + "grad_norm": 1.180858611807922, + "learning_rate": 3.373591182089797e-08, + "loss": 0.5003, + "step": 12563 + }, + { + "epoch": 0.97, + "grad_norm": 1.2974452867530155, + "learning_rate": 3.353000667560946e-08, + "loss": 0.4998, + "step": 12564 + }, + { + "epoch": 0.97, + "grad_norm": 1.1624007172386073, + "learning_rate": 3.3324730764991985e-08, + "loss": 0.4966, + "step": 12565 + }, + { + "epoch": 0.97, + "grad_norm": 1.1742607246251906, + "learning_rate": 3.312008410200518e-08, + "loss": 0.4783, + "step": 12566 + }, + { + "epoch": 0.97, + "grad_norm": 1.3040481645576383, + "learning_rate": 3.2916066699570926e-08, + "loss": 0.5205, + "step": 12567 + }, + { + "epoch": 0.98, + "grad_norm": 1.083866092311369, + "learning_rate": 3.271267857056781e-08, + "loss": 0.5162, + "step": 12568 + }, + { + "epoch": 0.98, + "grad_norm": 1.1746480556724765, + "learning_rate": 3.250991972783779e-08, + "loss": 0.449, + "step": 12569 + }, + { + "epoch": 0.98, + "grad_norm": 1.3303790095242969, + "learning_rate": 3.230779018418284e-08, + "loss": 0.5377, + "step": 12570 + }, + { + "epoch": 0.98, + "grad_norm": 1.2407941861421647, + "learning_rate": 3.210628995236276e-08, + "loss": 0.5397, + "step": 12571 + }, + { + "epoch": 0.98, + "grad_norm": 1.0852404765651726, + "learning_rate": 3.190541904510069e-08, + "loss": 0.4063, + "step": 12572 + }, + { + "epoch": 0.98, + "grad_norm": 1.2172157705449325, + "learning_rate": 3.170517747507762e-08, + "loss": 0.4855, + "step": 12573 + }, + { + "epoch": 0.98, + "grad_norm": 1.1867336008575373, + "learning_rate": 3.1505565254936755e-08, + "loss": 0.5503, + "step": 12574 + }, + { + "epoch": 0.98, + "grad_norm": 1.2104968838310863, + "learning_rate": 3.130658239728024e-08, + "loss": 0.5343, + "step": 12575 + }, + { + "epoch": 0.98, + "grad_norm": 1.0946370478706835, + "learning_rate": 3.1108228914670245e-08, + "loss": 0.4572, + "step": 12576 + }, + { + "epoch": 0.98, + "grad_norm": 1.2518267634712072, + "learning_rate": 3.0910504819631205e-08, + "loss": 0.5355, + "step": 12577 + }, + { + "epoch": 0.98, + "grad_norm": 1.2287984986304756, + "learning_rate": 3.071341012464535e-08, + "loss": 0.5213, + "step": 12578 + }, + { + "epoch": 0.98, + "grad_norm": 1.3161426585916085, + "learning_rate": 3.051694484215717e-08, + "loss": 0.5149, + "step": 12579 + }, + { + "epoch": 0.98, + "grad_norm": 1.2385400447508659, + "learning_rate": 3.032110898457008e-08, + "loss": 0.4832, + "step": 12580 + }, + { + "epoch": 0.98, + "grad_norm": 1.2257748644378619, + "learning_rate": 3.012590256424752e-08, + "loss": 0.5006, + "step": 12581 + }, + { + "epoch": 0.98, + "grad_norm": 1.2571520759736823, + "learning_rate": 2.993132559351519e-08, + "loss": 0.5157, + "step": 12582 + }, + { + "epoch": 0.98, + "grad_norm": 1.3422654845349395, + "learning_rate": 2.9737378084656597e-08, + "loss": 0.4989, + "step": 12583 + }, + { + "epoch": 0.98, + "grad_norm": 1.1401826183547275, + "learning_rate": 2.954406004991639e-08, + "loss": 0.475, + "step": 12584 + }, + { + "epoch": 0.98, + "grad_norm": 1.20654203012724, + "learning_rate": 2.935137150150147e-08, + "loss": 0.4997, + "step": 12585 + }, + { + "epoch": 0.98, + "grad_norm": 1.1993375837364884, + "learning_rate": 2.915931245157544e-08, + "loss": 0.5235, + "step": 12586 + }, + { + "epoch": 0.98, + "grad_norm": 1.154345051491068, + "learning_rate": 2.8967882912265265e-08, + "loss": 0.4759, + "step": 12587 + }, + { + "epoch": 0.98, + "grad_norm": 1.212403769459618, + "learning_rate": 2.8777082895656837e-08, + "loss": 0.4856, + "step": 12588 + }, + { + "epoch": 0.98, + "grad_norm": 1.2418752793732681, + "learning_rate": 2.8586912413794966e-08, + "loss": 0.5041, + "step": 12589 + }, + { + "epoch": 0.98, + "grad_norm": 1.3622836859940326, + "learning_rate": 2.8397371478687818e-08, + "loss": 0.5332, + "step": 12590 + }, + { + "epoch": 0.98, + "grad_norm": 1.1319309356028986, + "learning_rate": 2.820846010230138e-08, + "loss": 0.4785, + "step": 12591 + }, + { + "epoch": 0.98, + "grad_norm": 1.2693726895422595, + "learning_rate": 2.8020178296562784e-08, + "loss": 0.5224, + "step": 12592 + }, + { + "epoch": 0.98, + "grad_norm": 1.3295500346937035, + "learning_rate": 2.783252607335807e-08, + "loss": 0.5083, + "step": 12593 + }, + { + "epoch": 0.98, + "grad_norm": 1.1587727321629475, + "learning_rate": 2.764550344453554e-08, + "loss": 0.507, + "step": 12594 + }, + { + "epoch": 0.98, + "grad_norm": 1.1748750052015278, + "learning_rate": 2.7459110421903524e-08, + "loss": 0.5074, + "step": 12595 + }, + { + "epoch": 0.98, + "grad_norm": 1.128433709424777, + "learning_rate": 2.727334701722928e-08, + "loss": 0.4599, + "step": 12596 + }, + { + "epoch": 0.98, + "grad_norm": 1.2057224325674472, + "learning_rate": 2.70882132422412e-08, + "loss": 0.4755, + "step": 12597 + }, + { + "epoch": 0.98, + "grad_norm": 1.1270810588501117, + "learning_rate": 2.6903709108627718e-08, + "loss": 0.4236, + "step": 12598 + }, + { + "epoch": 0.98, + "grad_norm": 1.3004548634194812, + "learning_rate": 2.6719834628037287e-08, + "loss": 0.5096, + "step": 12599 + }, + { + "epoch": 0.98, + "grad_norm": 1.3358761065023312, + "learning_rate": 2.6536589812079517e-08, + "loss": 0.5493, + "step": 12600 + }, + { + "epoch": 0.98, + "grad_norm": 1.0738701827054769, + "learning_rate": 2.6353974672322923e-08, + "loss": 0.482, + "step": 12601 + }, + { + "epoch": 0.98, + "grad_norm": 1.2054185680097944, + "learning_rate": 2.6171989220297177e-08, + "loss": 0.4911, + "step": 12602 + }, + { + "epoch": 0.98, + "grad_norm": 1.2355684233290045, + "learning_rate": 2.5990633467491976e-08, + "loss": 0.4885, + "step": 12603 + }, + { + "epoch": 0.98, + "grad_norm": 1.119647272544111, + "learning_rate": 2.580990742535705e-08, + "loss": 0.4366, + "step": 12604 + }, + { + "epoch": 0.98, + "grad_norm": 1.04430067189396, + "learning_rate": 2.562981110530216e-08, + "loss": 0.463, + "step": 12605 + }, + { + "epoch": 0.98, + "grad_norm": 1.1836326265402604, + "learning_rate": 2.545034451869821e-08, + "loss": 0.5025, + "step": 12606 + }, + { + "epoch": 0.98, + "grad_norm": 1.3094176019188473, + "learning_rate": 2.5271507676877248e-08, + "loss": 0.5762, + "step": 12607 + }, + { + "epoch": 0.98, + "grad_norm": 1.229317068607476, + "learning_rate": 2.5093300591128023e-08, + "loss": 0.5105, + "step": 12608 + }, + { + "epoch": 0.98, + "grad_norm": 1.1557713626254547, + "learning_rate": 2.4915723272702642e-08, + "loss": 0.4225, + "step": 12609 + }, + { + "epoch": 0.98, + "grad_norm": 1.2488192226556767, + "learning_rate": 2.4738775732812138e-08, + "loss": 0.4733, + "step": 12610 + }, + { + "epoch": 0.98, + "grad_norm": 1.272525765208751, + "learning_rate": 2.4562457982628683e-08, + "loss": 0.5141, + "step": 12611 + }, + { + "epoch": 0.98, + "grad_norm": 1.196111075726186, + "learning_rate": 2.438677003328338e-08, + "loss": 0.5235, + "step": 12612 + }, + { + "epoch": 0.98, + "grad_norm": 1.1794870976886063, + "learning_rate": 2.4211711895868462e-08, + "loss": 0.4887, + "step": 12613 + }, + { + "epoch": 0.98, + "grad_norm": 1.168045653806355, + "learning_rate": 2.403728358143731e-08, + "loss": 0.45, + "step": 12614 + }, + { + "epoch": 0.98, + "grad_norm": 1.1773530206650702, + "learning_rate": 2.3863485101001114e-08, + "loss": 0.483, + "step": 12615 + }, + { + "epoch": 0.98, + "grad_norm": 1.14599396128836, + "learning_rate": 2.3690316465533325e-08, + "loss": 0.4959, + "step": 12616 + }, + { + "epoch": 0.98, + "grad_norm": 1.252387056416989, + "learning_rate": 2.3517777685966305e-08, + "loss": 0.5047, + "step": 12617 + }, + { + "epoch": 0.98, + "grad_norm": 1.2420292879512145, + "learning_rate": 2.334586877319467e-08, + "loss": 0.5223, + "step": 12618 + }, + { + "epoch": 0.98, + "grad_norm": 1.1678464181173436, + "learning_rate": 2.3174589738070853e-08, + "loss": 0.4807, + "step": 12619 + }, + { + "epoch": 0.98, + "grad_norm": 1.2828047088869678, + "learning_rate": 2.3003940591408425e-08, + "loss": 0.4784, + "step": 12620 + }, + { + "epoch": 0.98, + "grad_norm": 1.144458911679828, + "learning_rate": 2.28339213439821e-08, + "loss": 0.5012, + "step": 12621 + }, + { + "epoch": 0.98, + "grad_norm": 1.2144344903860433, + "learning_rate": 2.266453200652552e-08, + "loss": 0.4906, + "step": 12622 + }, + { + "epoch": 0.98, + "grad_norm": 1.3063654816608978, + "learning_rate": 2.2495772589733456e-08, + "loss": 0.5064, + "step": 12623 + }, + { + "epoch": 0.98, + "grad_norm": 1.2055968122019955, + "learning_rate": 2.232764310426072e-08, + "loss": 0.5038, + "step": 12624 + }, + { + "epoch": 0.98, + "grad_norm": 1.242079244250955, + "learning_rate": 2.2160143560721048e-08, + "loss": 0.4767, + "step": 12625 + }, + { + "epoch": 0.98, + "grad_norm": 1.3478167009896238, + "learning_rate": 2.1993273969691532e-08, + "loss": 0.5037, + "step": 12626 + }, + { + "epoch": 0.98, + "grad_norm": 1.0968958165293736, + "learning_rate": 2.1827034341704855e-08, + "loss": 0.4541, + "step": 12627 + }, + { + "epoch": 0.98, + "grad_norm": 1.2237881427154937, + "learning_rate": 2.166142468725929e-08, + "loss": 0.5013, + "step": 12628 + }, + { + "epoch": 0.98, + "grad_norm": 1.1549132068440089, + "learning_rate": 2.1496445016809808e-08, + "loss": 0.4535, + "step": 12629 + }, + { + "epoch": 0.98, + "grad_norm": 1.1663659877688526, + "learning_rate": 2.133209534077141e-08, + "loss": 0.4821, + "step": 12630 + }, + { + "epoch": 0.98, + "grad_norm": 1.3071388013406051, + "learning_rate": 2.1168375669521346e-08, + "loss": 0.5582, + "step": 12631 + }, + { + "epoch": 0.98, + "grad_norm": 1.2073892599468885, + "learning_rate": 2.1005286013394688e-08, + "loss": 0.4629, + "step": 12632 + }, + { + "epoch": 0.98, + "grad_norm": 1.2083539906193934, + "learning_rate": 2.0842826382689864e-08, + "loss": 0.4557, + "step": 12633 + }, + { + "epoch": 0.98, + "grad_norm": 1.154444806064512, + "learning_rate": 2.068099678766311e-08, + "loss": 0.4534, + "step": 12634 + }, + { + "epoch": 0.98, + "grad_norm": 1.2691488306232885, + "learning_rate": 2.0519797238531813e-08, + "loss": 0.5131, + "step": 12635 + }, + { + "epoch": 0.98, + "grad_norm": 1.1546102701874512, + "learning_rate": 2.0359227745472278e-08, + "loss": 0.4635, + "step": 12636 + }, + { + "epoch": 0.98, + "grad_norm": 1.22282886200742, + "learning_rate": 2.0199288318624165e-08, + "loss": 0.5263, + "step": 12637 + }, + { + "epoch": 0.98, + "grad_norm": 1.2637600757412284, + "learning_rate": 2.003997896808274e-08, + "loss": 0.513, + "step": 12638 + }, + { + "epoch": 0.98, + "grad_norm": 1.3187990260841724, + "learning_rate": 1.988129970390773e-08, + "loss": 0.4691, + "step": 12639 + }, + { + "epoch": 0.98, + "grad_norm": 1.220886363200298, + "learning_rate": 1.972325053611779e-08, + "loss": 0.4486, + "step": 12640 + }, + { + "epoch": 0.98, + "grad_norm": 1.1272970696167466, + "learning_rate": 1.9565831474689377e-08, + "loss": 0.5075, + "step": 12641 + }, + { + "epoch": 0.98, + "grad_norm": 1.2195276078262047, + "learning_rate": 1.9409042529562327e-08, + "loss": 0.4991, + "step": 12642 + }, + { + "epoch": 0.98, + "grad_norm": 1.165288548324413, + "learning_rate": 1.9252883710635383e-08, + "loss": 0.5002, + "step": 12643 + }, + { + "epoch": 0.98, + "grad_norm": 1.257544163104745, + "learning_rate": 1.909735502776844e-08, + "loss": 0.563, + "step": 12644 + }, + { + "epoch": 0.98, + "grad_norm": 1.1696889196413247, + "learning_rate": 1.8942456490780305e-08, + "loss": 0.4523, + "step": 12645 + }, + { + "epoch": 0.98, + "grad_norm": 1.3571724861538996, + "learning_rate": 1.8788188109449822e-08, + "loss": 0.5102, + "step": 12646 + }, + { + "epoch": 0.98, + "grad_norm": 1.232620247474563, + "learning_rate": 1.8634549893516983e-08, + "loss": 0.4757, + "step": 12647 + }, + { + "epoch": 0.98, + "grad_norm": 1.3154257453570524, + "learning_rate": 1.8481541852682917e-08, + "loss": 0.5532, + "step": 12648 + }, + { + "epoch": 0.98, + "grad_norm": 1.1616027240495295, + "learning_rate": 1.832916399660656e-08, + "loss": 0.4633, + "step": 12649 + }, + { + "epoch": 0.98, + "grad_norm": 1.192816877909245, + "learning_rate": 1.8177416334907995e-08, + "loss": 0.5134, + "step": 12650 + }, + { + "epoch": 0.98, + "grad_norm": 1.1525956218050522, + "learning_rate": 1.8026298877169557e-08, + "loss": 0.4664, + "step": 12651 + }, + { + "epoch": 0.98, + "grad_norm": 1.1901307329124517, + "learning_rate": 1.7875811632930285e-08, + "loss": 0.4796, + "step": 12652 + }, + { + "epoch": 0.98, + "grad_norm": 1.1985796489738958, + "learning_rate": 1.7725954611692577e-08, + "loss": 0.4977, + "step": 12653 + }, + { + "epoch": 0.98, + "grad_norm": 1.2222830108254599, + "learning_rate": 1.757672782291775e-08, + "loss": 0.4819, + "step": 12654 + }, + { + "epoch": 0.98, + "grad_norm": 1.142257097579135, + "learning_rate": 1.742813127602605e-08, + "loss": 0.5035, + "step": 12655 + }, + { + "epoch": 0.98, + "grad_norm": 1.11314546526645, + "learning_rate": 1.728016498039886e-08, + "loss": 0.4671, + "step": 12656 + }, + { + "epoch": 0.98, + "grad_norm": 1.2919311509019973, + "learning_rate": 1.713282894537982e-08, + "loss": 0.5232, + "step": 12657 + }, + { + "epoch": 0.98, + "grad_norm": 1.1539355646850773, + "learning_rate": 1.6986123180270376e-08, + "loss": 0.4443, + "step": 12658 + }, + { + "epoch": 0.98, + "grad_norm": 1.2444507924884904, + "learning_rate": 1.6840047694332007e-08, + "loss": 0.4721, + "step": 12659 + }, + { + "epoch": 0.98, + "grad_norm": 1.2600276704050837, + "learning_rate": 1.6694602496788447e-08, + "loss": 0.481, + "step": 12660 + }, + { + "epoch": 0.98, + "grad_norm": 1.1196517975268374, + "learning_rate": 1.6549787596821242e-08, + "loss": 0.4641, + "step": 12661 + }, + { + "epoch": 0.98, + "grad_norm": 1.2072081698534942, + "learning_rate": 1.640560300357308e-08, + "loss": 0.5326, + "step": 12662 + }, + { + "epoch": 0.98, + "grad_norm": 1.2229784946094853, + "learning_rate": 1.6262048726148894e-08, + "loss": 0.5128, + "step": 12663 + }, + { + "epoch": 0.98, + "grad_norm": 1.297595134400607, + "learning_rate": 1.6119124773610328e-08, + "loss": 0.4879, + "step": 12664 + }, + { + "epoch": 0.98, + "grad_norm": 1.3088338147772178, + "learning_rate": 1.5976831154981275e-08, + "loss": 0.4948, + "step": 12665 + }, + { + "epoch": 0.98, + "grad_norm": 1.2027569988287352, + "learning_rate": 1.583516787924566e-08, + "loss": 0.4992, + "step": 12666 + }, + { + "epoch": 0.98, + "grad_norm": 1.211887946450703, + "learning_rate": 1.569413495534744e-08, + "loss": 0.5026, + "step": 12667 + }, + { + "epoch": 0.98, + "grad_norm": 1.212820299846413, + "learning_rate": 1.5553732392191712e-08, + "loss": 0.5573, + "step": 12668 + }, + { + "epoch": 0.98, + "grad_norm": 1.1885641750619365, + "learning_rate": 1.5413960198641388e-08, + "loss": 0.511, + "step": 12669 + }, + { + "epoch": 0.98, + "grad_norm": 1.283391140383113, + "learning_rate": 1.527481838352052e-08, + "loss": 0.525, + "step": 12670 + }, + { + "epoch": 0.98, + "grad_norm": 1.1577687470152433, + "learning_rate": 1.513630695561541e-08, + "loss": 0.5019, + "step": 12671 + }, + { + "epoch": 0.98, + "grad_norm": 1.1802811802432251, + "learning_rate": 1.499842592367018e-08, + "loss": 0.4751, + "step": 12672 + }, + { + "epoch": 0.98, + "grad_norm": 1.184738763592111, + "learning_rate": 1.4861175296390084e-08, + "loss": 0.4256, + "step": 12673 + }, + { + "epoch": 0.98, + "grad_norm": 1.1066242554783943, + "learning_rate": 1.4724555082441528e-08, + "loss": 0.4851, + "step": 12674 + }, + { + "epoch": 0.98, + "grad_norm": 1.2692062666224042, + "learning_rate": 1.458856529044872e-08, + "loss": 0.5065, + "step": 12675 + }, + { + "epoch": 0.98, + "grad_norm": 1.233754738563783, + "learning_rate": 1.4453205928997016e-08, + "loss": 0.5212, + "step": 12676 + }, + { + "epoch": 0.98, + "grad_norm": 1.2134549460580195, + "learning_rate": 1.4318477006632914e-08, + "loss": 0.5174, + "step": 12677 + }, + { + "epoch": 0.98, + "grad_norm": 1.1839502886601159, + "learning_rate": 1.418437853186294e-08, + "loss": 0.488, + "step": 12678 + }, + { + "epoch": 0.98, + "grad_norm": 1.243352283933351, + "learning_rate": 1.405091051315366e-08, + "loss": 0.4615, + "step": 12679 + }, + { + "epoch": 0.98, + "grad_norm": 1.2655166297009866, + "learning_rate": 1.3918072958931662e-08, + "loss": 0.4869, + "step": 12680 + }, + { + "epoch": 0.98, + "grad_norm": 1.2254379851869046, + "learning_rate": 1.3785865877581351e-08, + "loss": 0.4764, + "step": 12681 + }, + { + "epoch": 0.98, + "grad_norm": 1.2155500609533092, + "learning_rate": 1.3654289277452715e-08, + "loss": 0.4928, + "step": 12682 + }, + { + "epoch": 0.98, + "grad_norm": 1.2407765562561746, + "learning_rate": 1.3523343166851332e-08, + "loss": 0.5097, + "step": 12683 + }, + { + "epoch": 0.98, + "grad_norm": 1.2287690548665986, + "learning_rate": 1.3393027554045035e-08, + "loss": 0.4988, + "step": 12684 + }, + { + "epoch": 0.98, + "grad_norm": 1.2577501364465047, + "learning_rate": 1.3263342447260575e-08, + "loss": 0.455, + "step": 12685 + }, + { + "epoch": 0.98, + "grad_norm": 1.304038154599678, + "learning_rate": 1.3134287854685846e-08, + "loss": 0.4923, + "step": 12686 + }, + { + "epoch": 0.98, + "grad_norm": 1.106439602223515, + "learning_rate": 1.3005863784468774e-08, + "loss": 0.4866, + "step": 12687 + }, + { + "epoch": 0.98, + "grad_norm": 1.2633800611854116, + "learning_rate": 1.2878070244718433e-08, + "loss": 0.5417, + "step": 12688 + }, + { + "epoch": 0.98, + "grad_norm": 1.186966776340299, + "learning_rate": 1.2750907243501698e-08, + "loss": 0.4708, + "step": 12689 + }, + { + "epoch": 0.98, + "grad_norm": 1.2494317193488416, + "learning_rate": 1.2624374788848814e-08, + "loss": 0.4961, + "step": 12690 + }, + { + "epoch": 0.98, + "grad_norm": 1.3156602596532554, + "learning_rate": 1.2498472888745616e-08, + "loss": 0.4772, + "step": 12691 + }, + { + "epoch": 0.98, + "grad_norm": 1.2698992386592534, + "learning_rate": 1.2373201551143521e-08, + "loss": 0.5202, + "step": 12692 + }, + { + "epoch": 0.98, + "grad_norm": 1.1804663001084925, + "learning_rate": 1.2248560783950646e-08, + "loss": 0.5111, + "step": 12693 + }, + { + "epoch": 0.98, + "grad_norm": 1.2526284239938883, + "learning_rate": 1.2124550595036255e-08, + "loss": 0.5367, + "step": 12694 + }, + { + "epoch": 0.98, + "grad_norm": 1.2306195363494494, + "learning_rate": 1.2001170992228528e-08, + "loss": 0.4877, + "step": 12695 + }, + { + "epoch": 0.98, + "grad_norm": 1.177407746072349, + "learning_rate": 1.187842198331901e-08, + "loss": 0.5063, + "step": 12696 + }, + { + "epoch": 0.99, + "grad_norm": 1.181903248756463, + "learning_rate": 1.175630357605706e-08, + "loss": 0.4957, + "step": 12697 + }, + { + "epoch": 0.99, + "grad_norm": 1.109665523688508, + "learning_rate": 1.1634815778150954e-08, + "loss": 0.5012, + "step": 12698 + }, + { + "epoch": 0.99, + "grad_norm": 1.1468266198870443, + "learning_rate": 1.1513958597273445e-08, + "loss": 0.4636, + "step": 12699 + }, + { + "epoch": 0.99, + "grad_norm": 1.3306020637423956, + "learning_rate": 1.1393732041052874e-08, + "loss": 0.5574, + "step": 12700 + }, + { + "epoch": 0.99, + "grad_norm": 1.231717593095607, + "learning_rate": 1.1274136117080946e-08, + "loss": 0.501, + "step": 12701 + }, + { + "epoch": 0.99, + "grad_norm": 1.2513176172452023, + "learning_rate": 1.115517083290718e-08, + "loss": 0.5012, + "step": 12702 + }, + { + "epoch": 0.99, + "grad_norm": 1.1676995723145023, + "learning_rate": 1.1036836196043344e-08, + "loss": 0.496, + "step": 12703 + }, + { + "epoch": 0.99, + "grad_norm": 1.2389142557482795, + "learning_rate": 1.0919132213960126e-08, + "loss": 0.5206, + "step": 12704 + }, + { + "epoch": 0.99, + "grad_norm": 1.1331752327895728, + "learning_rate": 1.0802058894089363e-08, + "loss": 0.4722, + "step": 12705 + }, + { + "epoch": 0.99, + "grad_norm": 1.1956660925767462, + "learning_rate": 1.0685616243821806e-08, + "loss": 0.4979, + "step": 12706 + }, + { + "epoch": 0.99, + "grad_norm": 1.2624269747623686, + "learning_rate": 1.0569804270509354e-08, + "loss": 0.5209, + "step": 12707 + }, + { + "epoch": 0.99, + "grad_norm": 1.2372739527617718, + "learning_rate": 1.0454622981463935e-08, + "loss": 0.5085, + "step": 12708 + }, + { + "epoch": 0.99, + "grad_norm": 1.4619405638306442, + "learning_rate": 1.034007238395751e-08, + "loss": 0.5298, + "step": 12709 + }, + { + "epoch": 0.99, + "grad_norm": 1.2525100778414118, + "learning_rate": 1.0226152485222073e-08, + "loss": 0.4629, + "step": 12710 + }, + { + "epoch": 0.99, + "grad_norm": 1.2862428286457241, + "learning_rate": 1.0112863292450758e-08, + "loss": 0.5394, + "step": 12711 + }, + { + "epoch": 0.99, + "grad_norm": 1.0701279864016118, + "learning_rate": 1.0000204812794511e-08, + "loss": 0.4706, + "step": 12712 + }, + { + "epoch": 0.99, + "grad_norm": 1.163185608610436, + "learning_rate": 9.888177053367642e-09, + "loss": 0.4616, + "step": 12713 + }, + { + "epoch": 0.99, + "grad_norm": 1.2559422395934243, + "learning_rate": 9.776780021241161e-09, + "loss": 0.524, + "step": 12714 + }, + { + "epoch": 0.99, + "grad_norm": 1.1807718409210963, + "learning_rate": 9.666013723450552e-09, + "loss": 0.4642, + "step": 12715 + }, + { + "epoch": 0.99, + "grad_norm": 1.314801106296906, + "learning_rate": 9.555878166987998e-09, + "loss": 0.525, + "step": 12716 + }, + { + "epoch": 0.99, + "grad_norm": 1.2418551210253237, + "learning_rate": 9.446373358805716e-09, + "loss": 0.5123, + "step": 12717 + }, + { + "epoch": 0.99, + "grad_norm": 1.2614900836471672, + "learning_rate": 9.337499305819287e-09, + "loss": 0.499, + "step": 12718 + }, + { + "epoch": 0.99, + "grad_norm": 1.4273683676348017, + "learning_rate": 9.229256014900989e-09, + "loss": 0.5802, + "step": 12719 + }, + { + "epoch": 0.99, + "grad_norm": 1.1766008327769735, + "learning_rate": 9.121643492885358e-09, + "loss": 0.4721, + "step": 12720 + }, + { + "epoch": 0.99, + "grad_norm": 1.1745903599693428, + "learning_rate": 9.014661746566954e-09, + "loss": 0.4524, + "step": 12721 + }, + { + "epoch": 0.99, + "grad_norm": 1.348019139179454, + "learning_rate": 8.90831078269927e-09, + "loss": 0.5121, + "step": 12722 + }, + { + "epoch": 0.99, + "grad_norm": 1.301335809390326, + "learning_rate": 8.80259060799804e-09, + "loss": 0.5051, + "step": 12723 + }, + { + "epoch": 0.99, + "grad_norm": 1.1327991353266051, + "learning_rate": 8.697501229135708e-09, + "loss": 0.4515, + "step": 12724 + }, + { + "epoch": 0.99, + "grad_norm": 1.1477310706795962, + "learning_rate": 8.593042652749184e-09, + "loss": 0.4624, + "step": 12725 + }, + { + "epoch": 0.99, + "grad_norm": 1.3856188213122878, + "learning_rate": 8.489214885433195e-09, + "loss": 0.5233, + "step": 12726 + }, + { + "epoch": 0.99, + "grad_norm": 1.2141359972216665, + "learning_rate": 8.386017933741386e-09, + "loss": 0.4808, + "step": 12727 + }, + { + "epoch": 0.99, + "grad_norm": 1.2309942183122504, + "learning_rate": 8.283451804190767e-09, + "loss": 0.5559, + "step": 12728 + }, + { + "epoch": 0.99, + "grad_norm": 1.1809277614687603, + "learning_rate": 8.18151650325616e-09, + "loss": 0.4801, + "step": 12729 + }, + { + "epoch": 0.99, + "grad_norm": 1.3661456252677353, + "learning_rate": 8.080212037374636e-09, + "loss": 0.54, + "step": 12730 + }, + { + "epoch": 0.99, + "grad_norm": 1.2866423111094476, + "learning_rate": 7.97953841293997e-09, + "loss": 0.5094, + "step": 12731 + }, + { + "epoch": 0.99, + "grad_norm": 1.2295635951048012, + "learning_rate": 7.879495636308188e-09, + "loss": 0.5078, + "step": 12732 + }, + { + "epoch": 0.99, + "grad_norm": 1.2228714141184835, + "learning_rate": 7.78008371379757e-09, + "loss": 0.4787, + "step": 12733 + }, + { + "epoch": 0.99, + "grad_norm": 1.2432130622646405, + "learning_rate": 7.681302651683098e-09, + "loss": 0.5176, + "step": 12734 + }, + { + "epoch": 0.99, + "grad_norm": 1.2042672815934001, + "learning_rate": 7.583152456200892e-09, + "loss": 0.4908, + "step": 12735 + }, + { + "epoch": 0.99, + "grad_norm": 1.2644028590890768, + "learning_rate": 7.485633133549331e-09, + "loss": 0.5024, + "step": 12736 + }, + { + "epoch": 0.99, + "grad_norm": 1.2938829860431587, + "learning_rate": 7.3887446898834865e-09, + "loss": 0.4993, + "step": 12737 + }, + { + "epoch": 0.99, + "grad_norm": 1.3347526784106538, + "learning_rate": 7.292487131321802e-09, + "loss": 0.5021, + "step": 12738 + }, + { + "epoch": 0.99, + "grad_norm": 1.1829435873019603, + "learning_rate": 7.1968604639416354e-09, + "loss": 0.4752, + "step": 12739 + }, + { + "epoch": 0.99, + "grad_norm": 1.2377566119691414, + "learning_rate": 7.1018646937781596e-09, + "loss": 0.4571, + "step": 12740 + }, + { + "epoch": 0.99, + "grad_norm": 1.228612316796155, + "learning_rate": 7.007499826832132e-09, + "loss": 0.4611, + "step": 12741 + }, + { + "epoch": 0.99, + "grad_norm": 1.2366804009743393, + "learning_rate": 6.913765869058786e-09, + "loss": 0.4754, + "step": 12742 + }, + { + "epoch": 0.99, + "grad_norm": 1.182124973370936, + "learning_rate": 6.820662826376723e-09, + "loss": 0.5172, + "step": 12743 + }, + { + "epoch": 0.99, + "grad_norm": 1.2616431624030153, + "learning_rate": 6.728190704664572e-09, + "loss": 0.4868, + "step": 12744 + }, + { + "epoch": 0.99, + "grad_norm": 1.230429754698932, + "learning_rate": 6.636349509760997e-09, + "loss": 0.5769, + "step": 12745 + }, + { + "epoch": 0.99, + "grad_norm": 1.1692607457236275, + "learning_rate": 6.545139247462473e-09, + "loss": 0.4374, + "step": 12746 + }, + { + "epoch": 0.99, + "grad_norm": 1.1358010694175988, + "learning_rate": 6.454559923529946e-09, + "loss": 0.4582, + "step": 12747 + }, + { + "epoch": 0.99, + "grad_norm": 1.2873372873245224, + "learning_rate": 6.364611543679955e-09, + "loss": 0.5132, + "step": 12748 + }, + { + "epoch": 0.99, + "grad_norm": 1.331729172268932, + "learning_rate": 6.275294113592401e-09, + "loss": 0.535, + "step": 12749 + }, + { + "epoch": 0.99, + "grad_norm": 1.1422743695123125, + "learning_rate": 6.186607638907216e-09, + "loss": 0.4593, + "step": 12750 + }, + { + "epoch": 0.99, + "grad_norm": 1.2191683222476324, + "learning_rate": 6.098552125222146e-09, + "loss": 0.516, + "step": 12751 + }, + { + "epoch": 0.99, + "grad_norm": 1.1540944394717993, + "learning_rate": 6.0111275780971865e-09, + "loss": 0.4492, + "step": 12752 + }, + { + "epoch": 0.99, + "grad_norm": 1.2025172692722335, + "learning_rate": 5.924334003052368e-09, + "loss": 0.5226, + "step": 12753 + }, + { + "epoch": 0.99, + "grad_norm": 1.2681666625780705, + "learning_rate": 5.838171405566639e-09, + "loss": 0.5018, + "step": 12754 + }, + { + "epoch": 0.99, + "grad_norm": 1.0935665521729219, + "learning_rate": 5.752639791080095e-09, + "loss": 0.4704, + "step": 12755 + }, + { + "epoch": 0.99, + "grad_norm": 1.1377924930832464, + "learning_rate": 5.667739164993968e-09, + "loss": 0.4579, + "step": 12756 + }, + { + "epoch": 0.99, + "grad_norm": 1.3449772398203494, + "learning_rate": 5.583469532666197e-09, + "loss": 0.4832, + "step": 12757 + }, + { + "epoch": 0.99, + "grad_norm": 1.1875041660436534, + "learning_rate": 5.49983089941919e-09, + "loss": 0.4417, + "step": 12758 + }, + { + "epoch": 0.99, + "grad_norm": 1.1745423034253408, + "learning_rate": 5.416823270532057e-09, + "loss": 0.495, + "step": 12759 + }, + { + "epoch": 0.99, + "grad_norm": 1.2282987806532002, + "learning_rate": 5.334446651246161e-09, + "loss": 0.487, + "step": 12760 + }, + { + "epoch": 0.99, + "grad_norm": 1.342449889997135, + "learning_rate": 5.252701046762898e-09, + "loss": 0.5254, + "step": 12761 + }, + { + "epoch": 0.99, + "grad_norm": 1.2715543619281244, + "learning_rate": 5.171586462242584e-09, + "loss": 0.4865, + "step": 12762 + }, + { + "epoch": 0.99, + "grad_norm": 1.094478745872354, + "learning_rate": 5.091102902806677e-09, + "loss": 0.4544, + "step": 12763 + }, + { + "epoch": 0.99, + "grad_norm": 1.3019438808787112, + "learning_rate": 5.011250373535559e-09, + "loss": 0.4927, + "step": 12764 + }, + { + "epoch": 0.99, + "grad_norm": 1.298966892258081, + "learning_rate": 4.932028879472972e-09, + "loss": 0.5164, + "step": 12765 + }, + { + "epoch": 0.99, + "grad_norm": 1.1492545406906582, + "learning_rate": 4.8534384256182506e-09, + "loss": 0.4609, + "step": 12766 + }, + { + "epoch": 0.99, + "grad_norm": 1.1557608936018389, + "learning_rate": 4.775479016934092e-09, + "loss": 0.5113, + "step": 12767 + }, + { + "epoch": 0.99, + "grad_norm": 1.0462360424901878, + "learning_rate": 4.698150658343226e-09, + "loss": 0.4544, + "step": 12768 + }, + { + "epoch": 0.99, + "grad_norm": 1.2507010877989286, + "learning_rate": 4.621453354726191e-09, + "loss": 0.5325, + "step": 12769 + }, + { + "epoch": 0.99, + "grad_norm": 1.2383028142838413, + "learning_rate": 4.545387110926891e-09, + "loss": 0.4787, + "step": 12770 + }, + { + "epoch": 0.99, + "grad_norm": 1.1392576827492746, + "learning_rate": 4.4699519317459305e-09, + "loss": 0.4477, + "step": 12771 + }, + { + "epoch": 0.99, + "grad_norm": 1.2822288402760853, + "learning_rate": 4.395147821948387e-09, + "loss": 0.5515, + "step": 12772 + }, + { + "epoch": 0.99, + "grad_norm": 1.2124480147868137, + "learning_rate": 4.320974786254928e-09, + "loss": 0.4641, + "step": 12773 + }, + { + "epoch": 0.99, + "grad_norm": 1.2341956962057914, + "learning_rate": 4.247432829349585e-09, + "loss": 0.5244, + "step": 12774 + }, + { + "epoch": 0.99, + "grad_norm": 1.2746961025762316, + "learning_rate": 4.17452195587531e-09, + "loss": 0.5264, + "step": 12775 + }, + { + "epoch": 0.99, + "grad_norm": 1.1431327842918724, + "learning_rate": 4.102242170435089e-09, + "loss": 0.4609, + "step": 12776 + }, + { + "epoch": 0.99, + "grad_norm": 1.2277499240105445, + "learning_rate": 4.030593477591938e-09, + "loss": 0.4699, + "step": 12777 + }, + { + "epoch": 0.99, + "grad_norm": 1.207457087684253, + "learning_rate": 3.959575881870015e-09, + "loss": 0.4785, + "step": 12778 + }, + { + "epoch": 0.99, + "grad_norm": 1.2098685684230215, + "learning_rate": 3.889189387752401e-09, + "loss": 0.477, + "step": 12779 + }, + { + "epoch": 0.99, + "grad_norm": 1.1490477413322115, + "learning_rate": 3.81943399968443e-09, + "loss": 0.4662, + "step": 12780 + }, + { + "epoch": 0.99, + "grad_norm": 1.15566826697599, + "learning_rate": 3.750309722068135e-09, + "loss": 0.4696, + "step": 12781 + }, + { + "epoch": 0.99, + "grad_norm": 1.1612448674346205, + "learning_rate": 3.6818165592689138e-09, + "loss": 0.4977, + "step": 12782 + }, + { + "epoch": 0.99, + "grad_norm": 1.32667039987304, + "learning_rate": 3.6139545156110845e-09, + "loss": 0.5464, + "step": 12783 + }, + { + "epoch": 0.99, + "grad_norm": 1.1690986296927226, + "learning_rate": 3.546723595378998e-09, + "loss": 0.5062, + "step": 12784 + }, + { + "epoch": 0.99, + "grad_norm": 1.242103237916961, + "learning_rate": 3.480123802817037e-09, + "loss": 0.4797, + "step": 12785 + }, + { + "epoch": 0.99, + "grad_norm": 1.1727203371959878, + "learning_rate": 3.4141551421296158e-09, + "loss": 0.5447, + "step": 12786 + }, + { + "epoch": 0.99, + "grad_norm": 1.2458725975168847, + "learning_rate": 3.348817617483402e-09, + "loss": 0.5028, + "step": 12787 + }, + { + "epoch": 0.99, + "grad_norm": 1.3535245644630611, + "learning_rate": 3.284111233000653e-09, + "loss": 0.5261, + "step": 12788 + }, + { + "epoch": 0.99, + "grad_norm": 1.2896032412979295, + "learning_rate": 3.2200359927692105e-09, + "loss": 0.476, + "step": 12789 + }, + { + "epoch": 0.99, + "grad_norm": 1.1653521261845836, + "learning_rate": 3.1565919008336164e-09, + "loss": 0.5147, + "step": 12790 + }, + { + "epoch": 0.99, + "grad_norm": 1.1364300023553127, + "learning_rate": 3.093778961199556e-09, + "loss": 0.4568, + "step": 12791 + }, + { + "epoch": 0.99, + "grad_norm": 1.212805506964488, + "learning_rate": 3.0315971778316354e-09, + "loss": 0.4889, + "step": 12792 + }, + { + "epoch": 0.99, + "grad_norm": 1.2480915760040547, + "learning_rate": 2.9700465546567136e-09, + "loss": 0.5756, + "step": 12793 + }, + { + "epoch": 0.99, + "grad_norm": 1.2684883881497169, + "learning_rate": 2.909127095560571e-09, + "loss": 0.5234, + "step": 12794 + }, + { + "epoch": 0.99, + "grad_norm": 1.1934599638905135, + "learning_rate": 2.8488388043901303e-09, + "loss": 0.475, + "step": 12795 + }, + { + "epoch": 0.99, + "grad_norm": 1.2552156355818238, + "learning_rate": 2.7891816849501264e-09, + "loss": 0.4996, + "step": 12796 + }, + { + "epoch": 0.99, + "grad_norm": 1.365215077625375, + "learning_rate": 2.7301557410086554e-09, + "loss": 0.5418, + "step": 12797 + }, + { + "epoch": 0.99, + "grad_norm": 1.1764671884866285, + "learning_rate": 2.671760976291626e-09, + "loss": 0.4622, + "step": 12798 + }, + { + "epoch": 0.99, + "grad_norm": 1.2295036285086227, + "learning_rate": 2.6139973944849795e-09, + "loss": 0.469, + "step": 12799 + }, + { + "epoch": 0.99, + "grad_norm": 1.1373493765571152, + "learning_rate": 2.556864999236908e-09, + "loss": 0.5109, + "step": 12800 + }, + { + "epoch": 0.99, + "grad_norm": 1.1620364382639425, + "learning_rate": 2.5003637941534153e-09, + "loss": 0.4847, + "step": 12801 + }, + { + "epoch": 0.99, + "grad_norm": 1.2873899764669479, + "learning_rate": 2.4444937828027592e-09, + "loss": 0.5283, + "step": 12802 + }, + { + "epoch": 0.99, + "grad_norm": 1.285990018455841, + "learning_rate": 2.389254968712118e-09, + "loss": 0.4971, + "step": 12803 + }, + { + "epoch": 0.99, + "grad_norm": 1.141489511282108, + "learning_rate": 2.334647355368702e-09, + "loss": 0.4104, + "step": 12804 + }, + { + "epoch": 0.99, + "grad_norm": 1.2558244430406191, + "learning_rate": 2.280670946219754e-09, + "loss": 0.4653, + "step": 12805 + }, + { + "epoch": 0.99, + "grad_norm": 1.2653039595715418, + "learning_rate": 2.2273257446736586e-09, + "loss": 0.4555, + "step": 12806 + }, + { + "epoch": 0.99, + "grad_norm": 1.212344625683869, + "learning_rate": 2.174611754097722e-09, + "loss": 0.5237, + "step": 12807 + }, + { + "epoch": 0.99, + "grad_norm": 1.325821033750812, + "learning_rate": 2.122528977821503e-09, + "loss": 0.5268, + "step": 12808 + }, + { + "epoch": 0.99, + "grad_norm": 1.2797011340932254, + "learning_rate": 2.071077419131262e-09, + "loss": 0.5498, + "step": 12809 + }, + { + "epoch": 0.99, + "grad_norm": 1.177253941462722, + "learning_rate": 2.0202570812777323e-09, + "loss": 0.4783, + "step": 12810 + }, + { + "epoch": 0.99, + "grad_norm": 1.234470846583571, + "learning_rate": 1.9700679674672373e-09, + "loss": 0.5146, + "step": 12811 + }, + { + "epoch": 0.99, + "grad_norm": 1.2242221737286103, + "learning_rate": 1.9205100808694644e-09, + "loss": 0.5137, + "step": 12812 + }, + { + "epoch": 0.99, + "grad_norm": 1.185292249607657, + "learning_rate": 1.8715834246130215e-09, + "loss": 0.5357, + "step": 12813 + }, + { + "epoch": 0.99, + "grad_norm": 1.2970198584854682, + "learning_rate": 1.8232880017876597e-09, + "loss": 0.5064, + "step": 12814 + }, + { + "epoch": 0.99, + "grad_norm": 1.283950936137628, + "learning_rate": 1.775623815440941e-09, + "loss": 0.5173, + "step": 12815 + }, + { + "epoch": 0.99, + "grad_norm": 1.1472506456238225, + "learning_rate": 1.7285908685837904e-09, + "loss": 0.4178, + "step": 12816 + }, + { + "epoch": 0.99, + "grad_norm": 1.1838934490613877, + "learning_rate": 1.6821891641860543e-09, + "loss": 0.4775, + "step": 12817 + }, + { + "epoch": 0.99, + "grad_norm": 1.3355287517736054, + "learning_rate": 1.636418705174281e-09, + "loss": 0.5806, + "step": 12818 + }, + { + "epoch": 0.99, + "grad_norm": 1.2837279943197735, + "learning_rate": 1.591279494441711e-09, + "loss": 0.5243, + "step": 12819 + }, + { + "epoch": 0.99, + "grad_norm": 1.258090871015429, + "learning_rate": 1.5467715348360668e-09, + "loss": 0.4976, + "step": 12820 + }, + { + "epoch": 0.99, + "grad_norm": 1.1104370191663984, + "learning_rate": 1.502894829167323e-09, + "loss": 0.4364, + "step": 12821 + }, + { + "epoch": 0.99, + "grad_norm": 1.2667553410527597, + "learning_rate": 1.459649380207706e-09, + "loss": 0.4724, + "step": 12822 + }, + { + "epoch": 0.99, + "grad_norm": 1.313096274307284, + "learning_rate": 1.4170351906850344e-09, + "loss": 0.537, + "step": 12823 + }, + { + "epoch": 0.99, + "grad_norm": 1.1776148290993629, + "learning_rate": 1.3750522632915986e-09, + "loss": 0.468, + "step": 12824 + }, + { + "epoch": 0.99, + "grad_norm": 1.4211429765678127, + "learning_rate": 1.3337006006763908e-09, + "loss": 0.5084, + "step": 12825 + }, + { + "epoch": 1.0, + "grad_norm": 1.0702948464378694, + "learning_rate": 1.292980205451766e-09, + "loss": 0.4506, + "step": 12826 + }, + { + "epoch": 1.0, + "grad_norm": 1.3529927618590143, + "learning_rate": 1.25289108018678e-09, + "loss": 0.5111, + "step": 12827 + }, + { + "epoch": 1.0, + "grad_norm": 1.3467819695085206, + "learning_rate": 1.2134332274149619e-09, + "loss": 0.5082, + "step": 12828 + }, + { + "epoch": 1.0, + "grad_norm": 1.1463209617389374, + "learning_rate": 1.1746066496243214e-09, + "loss": 0.4773, + "step": 12829 + }, + { + "epoch": 1.0, + "grad_norm": 1.2291410034000652, + "learning_rate": 1.1364113492695617e-09, + "loss": 0.4777, + "step": 12830 + }, + { + "epoch": 1.0, + "grad_norm": 1.1750131930970078, + "learning_rate": 1.0988473287598667e-09, + "loss": 0.521, + "step": 12831 + }, + { + "epoch": 1.0, + "grad_norm": 1.3122985321913427, + "learning_rate": 1.061914590467783e-09, + "loss": 0.5624, + "step": 12832 + }, + { + "epoch": 1.0, + "grad_norm": 1.193020936515233, + "learning_rate": 1.0256131367236688e-09, + "loss": 0.5322, + "step": 12833 + }, + { + "epoch": 1.0, + "grad_norm": 1.2552957412022692, + "learning_rate": 9.899429698212448e-10, + "loss": 0.4719, + "step": 12834 + }, + { + "epoch": 1.0, + "grad_norm": 1.1478150945312688, + "learning_rate": 9.549040920120433e-10, + "loss": 0.4432, + "step": 12835 + }, + { + "epoch": 1.0, + "grad_norm": 1.1489883451210987, + "learning_rate": 9.204965055076287e-10, + "loss": 0.49, + "step": 12836 + }, + { + "epoch": 1.0, + "grad_norm": 1.2036015941369709, + "learning_rate": 8.867202124818175e-10, + "loss": 0.4963, + "step": 12837 + }, + { + "epoch": 1.0, + "grad_norm": 1.1304358445277378, + "learning_rate": 8.535752150651277e-10, + "loss": 0.4901, + "step": 12838 + }, + { + "epoch": 1.0, + "grad_norm": 1.2300702130816425, + "learning_rate": 8.210615153503299e-10, + "loss": 0.5489, + "step": 12839 + }, + { + "epoch": 1.0, + "grad_norm": 1.1910347781544575, + "learning_rate": 7.891791153924466e-10, + "loss": 0.5424, + "step": 12840 + }, + { + "epoch": 1.0, + "grad_norm": 1.1025158159155233, + "learning_rate": 7.579280172020919e-10, + "loss": 0.4601, + "step": 12841 + }, + { + "epoch": 1.0, + "grad_norm": 1.13989950648964, + "learning_rate": 7.273082227532424e-10, + "loss": 0.5087, + "step": 12842 + }, + { + "epoch": 1.0, + "grad_norm": 1.2908781731593217, + "learning_rate": 6.97319733977686e-10, + "loss": 0.5117, + "step": 12843 + }, + { + "epoch": 1.0, + "grad_norm": 1.1354390180061806, + "learning_rate": 6.679625527716837e-10, + "loss": 0.4893, + "step": 12844 + }, + { + "epoch": 1.0, + "grad_norm": 1.3138743425755401, + "learning_rate": 6.392366809859773e-10, + "loss": 0.5462, + "step": 12845 + }, + { + "epoch": 1.0, + "grad_norm": 1.2061990381257566, + "learning_rate": 6.111421204357814e-10, + "loss": 0.4884, + "step": 12846 + }, + { + "epoch": 1.0, + "grad_norm": 1.2542096302788721, + "learning_rate": 5.836788728930121e-10, + "loss": 0.5311, + "step": 12847 + }, + { + "epoch": 1.0, + "grad_norm": 1.3202656393369088, + "learning_rate": 5.568469400940579e-10, + "loss": 0.5222, + "step": 12848 + }, + { + "epoch": 1.0, + "grad_norm": 1.3227679652146, + "learning_rate": 5.30646323730899e-10, + "loss": 0.5325, + "step": 12849 + }, + { + "epoch": 1.0, + "grad_norm": 1.2876945867063403, + "learning_rate": 5.050770254588777e-10, + "loss": 0.5751, + "step": 12850 + }, + { + "epoch": 1.0, + "grad_norm": 1.187574032935247, + "learning_rate": 4.801390468922584e-10, + "loss": 0.4579, + "step": 12851 + }, + { + "epoch": 1.0, + "grad_norm": 1.285203463258182, + "learning_rate": 4.5583238960533736e-10, + "loss": 0.5233, + "step": 12852 + }, + { + "epoch": 1.0, + "grad_norm": 1.1133538473451257, + "learning_rate": 4.321570551324428e-10, + "loss": 0.4226, + "step": 12853 + }, + { + "epoch": 1.0, + "grad_norm": 1.1663889838318557, + "learning_rate": 4.091130449679348e-10, + "loss": 0.4677, + "step": 12854 + }, + { + "epoch": 1.0, + "grad_norm": 1.1724266788193352, + "learning_rate": 3.8670036056731585e-10, + "loss": 0.493, + "step": 12855 + }, + { + "epoch": 1.0, + "grad_norm": 1.13859785568944, + "learning_rate": 3.649190033461203e-10, + "loss": 0.4663, + "step": 12856 + }, + { + "epoch": 1.0, + "grad_norm": 1.2017358463883105, + "learning_rate": 3.43768974677694e-10, + "loss": 0.4671, + "step": 12857 + }, + { + "epoch": 1.0, + "grad_norm": 1.3005809916172948, + "learning_rate": 3.232502758998557e-10, + "loss": 0.5257, + "step": 12858 + }, + { + "epoch": 1.0, + "grad_norm": 1.1712508001497808, + "learning_rate": 3.0336290830601523e-10, + "loss": 0.4861, + "step": 12859 + }, + { + "epoch": 1.0, + "grad_norm": 1.1428841466353052, + "learning_rate": 2.8410687315294504e-10, + "loss": 0.4977, + "step": 12860 + }, + { + "epoch": 1.0, + "grad_norm": 1.3101388037287138, + "learning_rate": 2.6548217165633937e-10, + "loss": 0.5179, + "step": 12861 + }, + { + "epoch": 1.0, + "grad_norm": 1.1900309144340138, + "learning_rate": 2.474888049908142e-10, + "loss": 0.4883, + "step": 12862 + }, + { + "epoch": 1.0, + "grad_norm": 1.0761382552916974, + "learning_rate": 2.3012677429323782e-10, + "loss": 0.4621, + "step": 12863 + }, + { + "epoch": 1.0, + "grad_norm": 1.1498276996075498, + "learning_rate": 2.1339608066051064e-10, + "loss": 0.5184, + "step": 12864 + }, + { + "epoch": 1.0, + "grad_norm": 1.2162790436526083, + "learning_rate": 1.9729672514845477e-10, + "loss": 0.4903, + "step": 12865 + }, + { + "epoch": 1.0, + "grad_norm": 1.364325050002346, + "learning_rate": 1.81828708771814e-10, + "loss": 0.5696, + "step": 12866 + }, + { + "epoch": 1.0, + "grad_norm": 1.210383036020965, + "learning_rate": 1.669920325098051e-10, + "loss": 0.5131, + "step": 12867 + }, + { + "epoch": 1.0, + "grad_norm": 1.2000195938735925, + "learning_rate": 1.5278669729723582e-10, + "loss": 0.4936, + "step": 12868 + }, + { + "epoch": 1.0, + "grad_norm": 1.17990206352433, + "learning_rate": 1.392127040322766e-10, + "loss": 0.497, + "step": 12869 + }, + { + "epoch": 1.0, + "grad_norm": 1.1404167403217738, + "learning_rate": 1.2627005357090938e-10, + "loss": 0.4577, + "step": 12870 + }, + { + "epoch": 1.0, + "grad_norm": 1.2433683912194202, + "learning_rate": 1.1395874673136853e-10, + "loss": 0.5056, + "step": 12871 + }, + { + "epoch": 1.0, + "grad_norm": 1.2172539160338953, + "learning_rate": 1.0227878428969995e-10, + "loss": 0.4555, + "step": 12872 + }, + { + "epoch": 1.0, + "grad_norm": 1.2806800411938544, + "learning_rate": 9.123016698309173e-11, + "loss": 0.4979, + "step": 12873 + }, + { + "epoch": 1.0, + "grad_norm": 1.2302234223442006, + "learning_rate": 8.081289551098437e-11, + "loss": 0.4998, + "step": 12874 + }, + { + "epoch": 1.0, + "grad_norm": 1.265595094303915, + "learning_rate": 7.102697052951968e-11, + "loss": 0.4699, + "step": 12875 + }, + { + "epoch": 1.0, + "grad_norm": 1.3867426050847587, + "learning_rate": 6.187239265709188e-11, + "loss": 0.5241, + "step": 12876 + }, + { + "epoch": 1.0, + "grad_norm": 1.126374358869901, + "learning_rate": 5.334916247212718e-11, + "loss": 0.4993, + "step": 12877 + }, + { + "epoch": 1.0, + "grad_norm": 1.2527189248455288, + "learning_rate": 4.5457280510863286e-11, + "loss": 0.5114, + "step": 12878 + }, + { + "epoch": 1.0, + "grad_norm": 1.2873742810691888, + "learning_rate": 3.8196747274010794e-11, + "loss": 0.5341, + "step": 12879 + }, + { + "epoch": 1.0, + "grad_norm": 1.073346480907056, + "learning_rate": 3.1567563217871356e-11, + "loss": 0.4682, + "step": 12880 + }, + { + "epoch": 1.0, + "grad_norm": 1.216398121752131, + "learning_rate": 2.5569728763219505e-11, + "loss": 0.4824, + "step": 12881 + }, + { + "epoch": 1.0, + "grad_norm": 1.237128506395995, + "learning_rate": 2.0203244286420842e-11, + "loss": 0.498, + "step": 12882 + }, + { + "epoch": 1.0, + "grad_norm": 1.2252380606881614, + "learning_rate": 1.546811012720362e-11, + "loss": 0.5131, + "step": 12883 + }, + { + "epoch": 1.0, + "grad_norm": 1.1420296681286557, + "learning_rate": 1.1364326585328045e-11, + "loss": 0.4729, + "step": 12884 + }, + { + "epoch": 1.0, + "grad_norm": 1.1551170468989012, + "learning_rate": 7.891893918365868e-12, + "loss": 0.4921, + "step": 12885 + }, + { + "epoch": 1.0, + "grad_norm": 1.5112252625112261, + "learning_rate": 5.0508123472514655e-12, + "loss": 0.4912, + "step": 12886 + }, + { + "epoch": 1.0, + "grad_norm": 1.1867831726870262, + "learning_rate": 2.841082049620525e-12, + "loss": 0.5362, + "step": 12887 + }, + { + "epoch": 1.0, + "grad_norm": 1.3153736807745786, + "learning_rate": 1.2627031664713685e-12, + "loss": 0.5467, + "step": 12888 + }, + { + "epoch": 1.0, + "grad_norm": 1.3907085779341495, + "learning_rate": 3.156757966138457e-13, + "loss": 0.5349, + "step": 12889 + }, + { + "epoch": 1.0, + "grad_norm": 1.3798019914655004, + "learning_rate": 0.0, + "loss": 0.4959, + "step": 12890 + }, + { + "epoch": 1.0, + "step": 12890, + "total_flos": 4.01818824385495e+16, + "train_loss": 0.5671361568664931, + "train_runtime": 271490.8709, + "train_samples_per_second": 6.077, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1.0, + "max_steps": 12890, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 700, + "total_flos": 4.01818824385495e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}