{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5090353779587682, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.18668322684139083, "learning_rate": 0.0, "loss": 1.8394, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.20650988735469475, "learning_rate": 0.0001, "loss": 2.3141, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.22046919046056535, "learning_rate": 0.0001, "loss": 2.1404, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.23873050737446988, "learning_rate": 9.997453526865293e-05, "loss": 2.1978, "step": 4 }, { "epoch": 0.0, "grad_norm": 0.2081567472868006, "learning_rate": 9.994907053730583e-05, "loss": 1.797, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.1886675070025996, "learning_rate": 9.992360580595875e-05, "loss": 1.6845, "step": 6 }, { "epoch": 0.0, "grad_norm": 0.1767301801295483, "learning_rate": 9.989814107461167e-05, "loss": 2.0519, "step": 7 }, { "epoch": 0.0, "grad_norm": 0.22052457220349336, "learning_rate": 9.987267634326458e-05, "loss": 1.9004, "step": 8 }, { "epoch": 0.0, "grad_norm": 0.2561394854396737, "learning_rate": 9.98472116119175e-05, "loss": 2.2788, "step": 9 }, { "epoch": 0.0, "grad_norm": 0.2309558457067922, "learning_rate": 9.982174688057042e-05, "loss": 2.036, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.2795391365887652, "learning_rate": 9.979628214922333e-05, "loss": 1.9411, "step": 11 }, { "epoch": 0.0, "grad_norm": 0.2718455156145512, "learning_rate": 9.977081741787625e-05, "loss": 1.8239, "step": 12 }, { "epoch": 0.0, "grad_norm": 0.24375797038618635, "learning_rate": 9.974535268652915e-05, "loss": 1.7713, "step": 13 }, { "epoch": 0.0, "grad_norm": 0.3212734322963607, "learning_rate": 9.971988795518208e-05, "loss": 2.0731, "step": 14 }, { "epoch": 0.0, "grad_norm": 0.2949670415991085, "learning_rate": 9.9694423223835e-05, "loss": 1.7618, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.26650378771339317, "learning_rate": 9.96689584924879e-05, "loss": 2.0073, "step": 16 }, { "epoch": 0.0, "grad_norm": 0.2303602561325778, "learning_rate": 9.964349376114083e-05, "loss": 1.8402, "step": 17 }, { "epoch": 0.0, "grad_norm": 0.19071474503273808, "learning_rate": 9.961802902979374e-05, "loss": 1.6371, "step": 18 }, { "epoch": 0.0, "grad_norm": 0.21247058517565695, "learning_rate": 9.959256429844665e-05, "loss": 1.7291, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.29788490484004526, "learning_rate": 9.956709956709958e-05, "loss": 2.1516, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.24952736523685068, "learning_rate": 9.954163483575249e-05, "loss": 1.8062, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.24339158164984467, "learning_rate": 9.95161701044054e-05, "loss": 2.0696, "step": 22 }, { "epoch": 0.01, "grad_norm": 0.24682448902556342, "learning_rate": 9.949070537305833e-05, "loss": 1.8974, "step": 23 }, { "epoch": 0.01, "grad_norm": 0.2520444837431572, "learning_rate": 9.946524064171123e-05, "loss": 1.9982, "step": 24 }, { "epoch": 0.01, "grad_norm": 0.21667345503265834, "learning_rate": 9.943977591036415e-05, "loss": 1.8855, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.20205030040213595, "learning_rate": 9.941431117901706e-05, "loss": 1.6719, "step": 26 }, { "epoch": 0.01, "grad_norm": 0.25663378051735036, "learning_rate": 9.938884644766998e-05, "loss": 1.8823, "step": 27 }, { "epoch": 0.01, "grad_norm": 0.24494613515141275, "learning_rate": 9.93633817163229e-05, "loss": 1.7899, "step": 28 }, { "epoch": 0.01, "grad_norm": 0.22728921000088043, "learning_rate": 9.933791698497581e-05, "loss": 2.0576, "step": 29 }, { "epoch": 0.01, "grad_norm": 0.20033400778675428, "learning_rate": 9.931245225362873e-05, "loss": 1.8655, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.2458925257086536, "learning_rate": 9.928698752228165e-05, "loss": 1.9585, "step": 31 }, { "epoch": 0.01, "grad_norm": 0.1861874815322032, "learning_rate": 9.926152279093456e-05, "loss": 1.9062, "step": 32 }, { "epoch": 0.01, "grad_norm": 0.2199191542164078, "learning_rate": 9.923605805958748e-05, "loss": 2.0163, "step": 33 }, { "epoch": 0.01, "grad_norm": 0.2168328882496767, "learning_rate": 9.921059332824039e-05, "loss": 1.9842, "step": 34 }, { "epoch": 0.01, "grad_norm": 0.21647678836894843, "learning_rate": 9.91851285968933e-05, "loss": 1.7001, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.20657337587150454, "learning_rate": 9.915966386554623e-05, "loss": 1.9808, "step": 36 }, { "epoch": 0.01, "grad_norm": 0.22285460119825284, "learning_rate": 9.913419913419914e-05, "loss": 2.1209, "step": 37 }, { "epoch": 0.01, "grad_norm": 0.2037563233184191, "learning_rate": 9.910873440285205e-05, "loss": 1.6725, "step": 38 }, { "epoch": 0.01, "grad_norm": 0.21991517343016573, "learning_rate": 9.908326967150498e-05, "loss": 2.0868, "step": 39 }, { "epoch": 0.01, "grad_norm": 0.21562313548954337, "learning_rate": 9.905780494015789e-05, "loss": 2.0301, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.26596374509967907, "learning_rate": 9.90323402088108e-05, "loss": 2.0505, "step": 41 }, { "epoch": 0.01, "grad_norm": 0.24973537924167893, "learning_rate": 9.900687547746372e-05, "loss": 2.4615, "step": 42 }, { "epoch": 0.01, "grad_norm": 0.21510668184289308, "learning_rate": 9.898141074611664e-05, "loss": 1.7368, "step": 43 }, { "epoch": 0.01, "grad_norm": 0.1723812623411351, "learning_rate": 9.895594601476955e-05, "loss": 1.6354, "step": 44 }, { "epoch": 0.01, "grad_norm": 0.22746169121427556, "learning_rate": 9.893048128342246e-05, "loss": 2.1088, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.27176215691032246, "learning_rate": 9.890501655207539e-05, "loss": 2.0887, "step": 46 }, { "epoch": 0.01, "grad_norm": 0.2113572994267603, "learning_rate": 9.88795518207283e-05, "loss": 1.8923, "step": 47 }, { "epoch": 0.01, "grad_norm": 0.2311921082666697, "learning_rate": 9.885408708938121e-05, "loss": 2.0053, "step": 48 }, { "epoch": 0.01, "grad_norm": 0.21631334700001129, "learning_rate": 9.882862235803412e-05, "loss": 1.8493, "step": 49 }, { "epoch": 0.01, "grad_norm": 0.2131940009966438, "learning_rate": 9.880315762668705e-05, "loss": 1.8507, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.26949683259896795, "learning_rate": 9.877769289533996e-05, "loss": 1.9211, "step": 51 }, { "epoch": 0.01, "grad_norm": 0.20666923901728515, "learning_rate": 9.875222816399287e-05, "loss": 1.7817, "step": 52 }, { "epoch": 0.01, "grad_norm": 0.17479038712810302, "learning_rate": 9.872676343264578e-05, "loss": 1.7987, "step": 53 }, { "epoch": 0.01, "grad_norm": 0.26824054724737734, "learning_rate": 9.870129870129871e-05, "loss": 1.9868, "step": 54 }, { "epoch": 0.01, "grad_norm": 0.2544263940578613, "learning_rate": 9.867583396995162e-05, "loss": 2.0314, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.2806195637276212, "learning_rate": 9.865036923860453e-05, "loss": 2.0125, "step": 56 }, { "epoch": 0.01, "grad_norm": 0.19544621658649786, "learning_rate": 9.862490450725746e-05, "loss": 1.7809, "step": 57 }, { "epoch": 0.01, "grad_norm": 0.2504147724761243, "learning_rate": 9.859943977591037e-05, "loss": 2.1766, "step": 58 }, { "epoch": 0.02, "grad_norm": 0.22152374260830437, "learning_rate": 9.857397504456328e-05, "loss": 2.0083, "step": 59 }, { "epoch": 0.02, "grad_norm": 0.20102838380117335, "learning_rate": 9.85485103132162e-05, "loss": 1.9817, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.21677533463700935, "learning_rate": 9.852304558186911e-05, "loss": 1.9689, "step": 61 }, { "epoch": 0.02, "grad_norm": 0.20905116609320215, "learning_rate": 9.849758085052203e-05, "loss": 2.0173, "step": 62 }, { "epoch": 0.02, "grad_norm": 0.21550938056512234, "learning_rate": 9.847211611917495e-05, "loss": 2.1043, "step": 63 }, { "epoch": 0.02, "grad_norm": 0.23724001721462154, "learning_rate": 9.844665138782786e-05, "loss": 2.0639, "step": 64 }, { "epoch": 0.02, "grad_norm": 0.2201372504716533, "learning_rate": 9.842118665648078e-05, "loss": 2.085, "step": 65 }, { "epoch": 0.02, "grad_norm": 0.2355750751853312, "learning_rate": 9.83957219251337e-05, "loss": 1.9968, "step": 66 }, { "epoch": 0.02, "grad_norm": 0.17637378642261511, "learning_rate": 9.837025719378661e-05, "loss": 1.771, "step": 67 }, { "epoch": 0.02, "grad_norm": 0.25906351758805374, "learning_rate": 9.834479246243953e-05, "loss": 2.2, "step": 68 }, { "epoch": 0.02, "grad_norm": 0.2275259890095465, "learning_rate": 9.831932773109243e-05, "loss": 1.8159, "step": 69 }, { "epoch": 0.02, "grad_norm": 0.1980881491845839, "learning_rate": 9.829386299974536e-05, "loss": 1.9643, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.2058717539357096, "learning_rate": 9.826839826839827e-05, "loss": 2.1604, "step": 71 }, { "epoch": 0.02, "grad_norm": 0.2143307195813568, "learning_rate": 9.824293353705118e-05, "loss": 1.7573, "step": 72 }, { "epoch": 0.02, "grad_norm": 0.23338330590904638, "learning_rate": 9.821746880570411e-05, "loss": 2.0012, "step": 73 }, { "epoch": 0.02, "grad_norm": 0.27815596429569944, "learning_rate": 9.819200407435702e-05, "loss": 1.9551, "step": 74 }, { "epoch": 0.02, "grad_norm": 0.22362637406455313, "learning_rate": 9.816653934300993e-05, "loss": 1.9162, "step": 75 }, { "epoch": 0.02, "grad_norm": 0.21896329766757727, "learning_rate": 9.814107461166286e-05, "loss": 2.0692, "step": 76 }, { "epoch": 0.02, "grad_norm": 0.22342664885681365, "learning_rate": 9.811560988031576e-05, "loss": 1.8841, "step": 77 }, { "epoch": 0.02, "grad_norm": 0.22785719541652857, "learning_rate": 9.809014514896868e-05, "loss": 1.9784, "step": 78 }, { "epoch": 0.02, "grad_norm": 0.26447957975744196, "learning_rate": 9.806468041762161e-05, "loss": 1.7973, "step": 79 }, { "epoch": 0.02, "grad_norm": 0.27247251807318934, "learning_rate": 9.80392156862745e-05, "loss": 1.9147, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.20979533207130613, "learning_rate": 9.801375095492743e-05, "loss": 1.9589, "step": 81 }, { "epoch": 0.02, "grad_norm": 0.20882101001884892, "learning_rate": 9.798828622358034e-05, "loss": 1.8915, "step": 82 }, { "epoch": 0.02, "grad_norm": 0.2303846254211127, "learning_rate": 9.796282149223326e-05, "loss": 1.8688, "step": 83 }, { "epoch": 0.02, "grad_norm": 0.195074823723581, "learning_rate": 9.793735676088618e-05, "loss": 1.9336, "step": 84 }, { "epoch": 0.02, "grad_norm": 0.21544466079041769, "learning_rate": 9.791189202953909e-05, "loss": 1.9476, "step": 85 }, { "epoch": 0.02, "grad_norm": 0.21979704680042544, "learning_rate": 9.7886427298192e-05, "loss": 1.8772, "step": 86 }, { "epoch": 0.02, "grad_norm": 0.2468756032888067, "learning_rate": 9.786096256684493e-05, "loss": 1.9661, "step": 87 }, { "epoch": 0.02, "grad_norm": 0.21893561526354924, "learning_rate": 9.783549783549783e-05, "loss": 1.8254, "step": 88 }, { "epoch": 0.02, "grad_norm": 0.22456850426384328, "learning_rate": 9.781003310415076e-05, "loss": 1.9367, "step": 89 }, { "epoch": 0.02, "grad_norm": 0.21911173951413468, "learning_rate": 9.778456837280368e-05, "loss": 1.823, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.18737492760853453, "learning_rate": 9.775910364145658e-05, "loss": 1.7855, "step": 91 }, { "epoch": 0.02, "grad_norm": 0.23796009872049736, "learning_rate": 9.77336389101095e-05, "loss": 2.1084, "step": 92 }, { "epoch": 0.02, "grad_norm": 0.24563292848492027, "learning_rate": 9.770817417876243e-05, "loss": 2.1699, "step": 93 }, { "epoch": 0.02, "grad_norm": 0.20949684127939133, "learning_rate": 9.768270944741533e-05, "loss": 2.1912, "step": 94 }, { "epoch": 0.02, "grad_norm": 0.19708490119220384, "learning_rate": 9.765724471606825e-05, "loss": 1.4705, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.2343314289283105, "learning_rate": 9.763177998472117e-05, "loss": 1.696, "step": 96 }, { "epoch": 0.02, "grad_norm": 0.21503323955083936, "learning_rate": 9.760631525337408e-05, "loss": 1.8542, "step": 97 }, { "epoch": 0.02, "grad_norm": 0.2317012546697141, "learning_rate": 9.7580850522027e-05, "loss": 1.7464, "step": 98 }, { "epoch": 0.03, "grad_norm": 0.20571972527548762, "learning_rate": 9.75553857906799e-05, "loss": 1.8839, "step": 99 }, { "epoch": 0.03, "grad_norm": 0.26962345039017876, "learning_rate": 9.752992105933283e-05, "loss": 2.1152, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.21136096551098874, "learning_rate": 9.750445632798575e-05, "loss": 1.794, "step": 101 }, { "epoch": 0.03, "grad_norm": 0.2624788065710887, "learning_rate": 9.747899159663865e-05, "loss": 2.0013, "step": 102 }, { "epoch": 0.03, "grad_norm": 0.20572013271793682, "learning_rate": 9.745352686529158e-05, "loss": 2.051, "step": 103 }, { "epoch": 0.03, "grad_norm": 0.2034459055059183, "learning_rate": 9.742806213394449e-05, "loss": 1.8117, "step": 104 }, { "epoch": 0.03, "grad_norm": 0.18901260521522967, "learning_rate": 9.74025974025974e-05, "loss": 1.8273, "step": 105 }, { "epoch": 0.03, "grad_norm": 0.23189693763581237, "learning_rate": 9.737713267125033e-05, "loss": 1.762, "step": 106 }, { "epoch": 0.03, "grad_norm": 0.22780526433581438, "learning_rate": 9.735166793990324e-05, "loss": 1.7569, "step": 107 }, { "epoch": 0.03, "grad_norm": 0.18899027335403254, "learning_rate": 9.732620320855615e-05, "loss": 1.7129, "step": 108 }, { "epoch": 0.03, "grad_norm": 0.19854782296756943, "learning_rate": 9.730073847720908e-05, "loss": 1.9007, "step": 109 }, { "epoch": 0.03, "grad_norm": 0.20392060357214004, "learning_rate": 9.727527374586199e-05, "loss": 1.6141, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.2378622341910083, "learning_rate": 9.72498090145149e-05, "loss": 1.9192, "step": 111 }, { "epoch": 0.03, "grad_norm": 0.2833162936645229, "learning_rate": 9.722434428316781e-05, "loss": 1.9424, "step": 112 }, { "epoch": 0.03, "grad_norm": 0.195262101346659, "learning_rate": 9.719887955182073e-05, "loss": 1.8432, "step": 113 }, { "epoch": 0.03, "grad_norm": 0.18626690744618216, "learning_rate": 9.717341482047365e-05, "loss": 1.7522, "step": 114 }, { "epoch": 0.03, "grad_norm": 0.25144789555295355, "learning_rate": 9.714795008912656e-05, "loss": 2.1025, "step": 115 }, { "epoch": 0.03, "grad_norm": 0.2507526720110009, "learning_rate": 9.712248535777948e-05, "loss": 1.9353, "step": 116 }, { "epoch": 0.03, "grad_norm": 0.2398695928710626, "learning_rate": 9.70970206264324e-05, "loss": 1.7189, "step": 117 }, { "epoch": 0.03, "grad_norm": 0.2049264892192432, "learning_rate": 9.707155589508531e-05, "loss": 1.81, "step": 118 }, { "epoch": 0.03, "grad_norm": 0.21239921304434395, "learning_rate": 9.704609116373823e-05, "loss": 1.8867, "step": 119 }, { "epoch": 0.03, "grad_norm": 0.19948118352409933, "learning_rate": 9.702062643239115e-05, "loss": 1.7714, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.23275057205842845, "learning_rate": 9.699516170104406e-05, "loss": 2.0321, "step": 121 }, { "epoch": 0.03, "grad_norm": 0.25282056951198856, "learning_rate": 9.696969696969698e-05, "loss": 2.1878, "step": 122 }, { "epoch": 0.03, "grad_norm": 0.22077724596241188, "learning_rate": 9.694423223834989e-05, "loss": 1.7711, "step": 123 }, { "epoch": 0.03, "grad_norm": 0.24358276719249283, "learning_rate": 9.69187675070028e-05, "loss": 1.7919, "step": 124 }, { "epoch": 0.03, "grad_norm": 0.23687386356473691, "learning_rate": 9.689330277565573e-05, "loss": 1.8732, "step": 125 }, { "epoch": 0.03, "grad_norm": 0.1961942510175504, "learning_rate": 9.686783804430864e-05, "loss": 1.8143, "step": 126 }, { "epoch": 0.03, "grad_norm": 0.24989519306549782, "learning_rate": 9.684237331296155e-05, "loss": 1.9983, "step": 127 }, { "epoch": 0.03, "grad_norm": 0.18367867838900498, "learning_rate": 9.681690858161448e-05, "loss": 1.8044, "step": 128 }, { "epoch": 0.03, "grad_norm": 0.1851171637376302, "learning_rate": 9.679144385026739e-05, "loss": 1.5508, "step": 129 }, { "epoch": 0.03, "grad_norm": 0.1981646287407315, "learning_rate": 9.67659791189203e-05, "loss": 1.6866, "step": 130 }, { "epoch": 0.03, "grad_norm": 0.25168898879511503, "learning_rate": 9.674051438757321e-05, "loss": 1.9691, "step": 131 }, { "epoch": 0.03, "grad_norm": 0.2406960292737266, "learning_rate": 9.671504965622614e-05, "loss": 1.8737, "step": 132 }, { "epoch": 0.03, "grad_norm": 0.23760573204480365, "learning_rate": 9.668958492487905e-05, "loss": 2.0618, "step": 133 }, { "epoch": 0.03, "grad_norm": 0.21245060514928146, "learning_rate": 9.666412019353196e-05, "loss": 1.7938, "step": 134 }, { "epoch": 0.03, "grad_norm": 0.23616623653756544, "learning_rate": 9.663865546218487e-05, "loss": 2.2237, "step": 135 }, { "epoch": 0.03, "grad_norm": 0.24385343826643593, "learning_rate": 9.66131907308378e-05, "loss": 2.1726, "step": 136 }, { "epoch": 0.03, "grad_norm": 0.2086523305951323, "learning_rate": 9.658772599949071e-05, "loss": 2.0386, "step": 137 }, { "epoch": 0.04, "grad_norm": 0.2209783795615669, "learning_rate": 9.656226126814362e-05, "loss": 1.8839, "step": 138 }, { "epoch": 0.04, "grad_norm": 0.20788942469293714, "learning_rate": 9.653679653679654e-05, "loss": 1.7717, "step": 139 }, { "epoch": 0.04, "grad_norm": 0.24552151589399218, "learning_rate": 9.651133180544946e-05, "loss": 1.9754, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.2176532927132165, "learning_rate": 9.648586707410237e-05, "loss": 2.0004, "step": 141 }, { "epoch": 0.04, "grad_norm": 0.23477900176569336, "learning_rate": 9.646040234275528e-05, "loss": 2.0929, "step": 142 }, { "epoch": 0.04, "grad_norm": 0.2599706772628387, "learning_rate": 9.643493761140821e-05, "loss": 1.9682, "step": 143 }, { "epoch": 0.04, "grad_norm": 0.2654960824368493, "learning_rate": 9.640947288006112e-05, "loss": 1.927, "step": 144 }, { "epoch": 0.04, "grad_norm": 0.20820691425532545, "learning_rate": 9.638400814871403e-05, "loss": 1.8643, "step": 145 }, { "epoch": 0.04, "grad_norm": 0.25697305594483716, "learning_rate": 9.635854341736695e-05, "loss": 1.9165, "step": 146 }, { "epoch": 0.04, "grad_norm": 0.2342038483361337, "learning_rate": 9.633307868601986e-05, "loss": 1.8247, "step": 147 }, { "epoch": 0.04, "grad_norm": 0.22760589993222088, "learning_rate": 9.630761395467278e-05, "loss": 1.7226, "step": 148 }, { "epoch": 0.04, "grad_norm": 0.24795127888830662, "learning_rate": 9.62821492233257e-05, "loss": 1.926, "step": 149 }, { "epoch": 0.04, "grad_norm": 0.21953914702795194, "learning_rate": 9.625668449197861e-05, "loss": 1.7582, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.2197624516317337, "learning_rate": 9.623121976063153e-05, "loss": 1.656, "step": 151 }, { "epoch": 0.04, "grad_norm": 0.2309760958845721, "learning_rate": 9.620575502928445e-05, "loss": 1.8533, "step": 152 }, { "epoch": 0.04, "grad_norm": 0.2568432331862067, "learning_rate": 9.618029029793736e-05, "loss": 2.1763, "step": 153 }, { "epoch": 0.04, "grad_norm": 0.24345830556671422, "learning_rate": 9.615482556659028e-05, "loss": 1.9257, "step": 154 }, { "epoch": 0.04, "grad_norm": 0.22500400705213244, "learning_rate": 9.612936083524318e-05, "loss": 1.7517, "step": 155 }, { "epoch": 0.04, "grad_norm": 0.2244433577284875, "learning_rate": 9.610389610389611e-05, "loss": 1.8401, "step": 156 }, { "epoch": 0.04, "grad_norm": 0.2966025758924672, "learning_rate": 9.607843137254903e-05, "loss": 2.049, "step": 157 }, { "epoch": 0.04, "grad_norm": 0.2554714815186226, "learning_rate": 9.605296664120193e-05, "loss": 1.9473, "step": 158 }, { "epoch": 0.04, "grad_norm": 0.2942129291965485, "learning_rate": 9.602750190985486e-05, "loss": 2.2011, "step": 159 }, { "epoch": 0.04, "grad_norm": 0.2525935056081843, "learning_rate": 9.600203717850777e-05, "loss": 1.8666, "step": 160 }, { "epoch": 0.04, "grad_norm": 0.28258844953158013, "learning_rate": 9.597657244716068e-05, "loss": 1.9187, "step": 161 }, { "epoch": 0.04, "grad_norm": 0.24764972341222816, "learning_rate": 9.595110771581361e-05, "loss": 1.9439, "step": 162 }, { "epoch": 0.04, "grad_norm": 0.22459914142681472, "learning_rate": 9.592564298446652e-05, "loss": 2.0468, "step": 163 }, { "epoch": 0.04, "grad_norm": 0.2383724960346664, "learning_rate": 9.590017825311943e-05, "loss": 1.8381, "step": 164 }, { "epoch": 0.04, "grad_norm": 0.2579963779731197, "learning_rate": 9.587471352177236e-05, "loss": 2.0741, "step": 165 }, { "epoch": 0.04, "grad_norm": 0.22364171605939087, "learning_rate": 9.584924879042526e-05, "loss": 1.6662, "step": 166 }, { "epoch": 0.04, "grad_norm": 0.23417480183675768, "learning_rate": 9.582378405907818e-05, "loss": 1.7069, "step": 167 }, { "epoch": 0.04, "grad_norm": 0.25343272730311545, "learning_rate": 9.579831932773111e-05, "loss": 1.763, "step": 168 }, { "epoch": 0.04, "grad_norm": 0.2517659726829119, "learning_rate": 9.5772854596384e-05, "loss": 2.1275, "step": 169 }, { "epoch": 0.04, "grad_norm": 0.25888382238424834, "learning_rate": 9.574738986503693e-05, "loss": 1.7395, "step": 170 }, { "epoch": 0.04, "grad_norm": 0.21638117276537386, "learning_rate": 9.572192513368984e-05, "loss": 1.9478, "step": 171 }, { "epoch": 0.04, "grad_norm": 0.19808260127052518, "learning_rate": 9.569646040234276e-05, "loss": 1.6967, "step": 172 }, { "epoch": 0.04, "grad_norm": 0.19907375744019035, "learning_rate": 9.567099567099568e-05, "loss": 1.7707, "step": 173 }, { "epoch": 0.04, "grad_norm": 0.2459730124614947, "learning_rate": 9.56455309396486e-05, "loss": 1.8615, "step": 174 }, { "epoch": 0.04, "grad_norm": 0.25489061716341954, "learning_rate": 9.56200662083015e-05, "loss": 2.1669, "step": 175 }, { "epoch": 0.04, "grad_norm": 0.22452811554198862, "learning_rate": 9.559460147695443e-05, "loss": 2.1389, "step": 176 }, { "epoch": 0.05, "grad_norm": 0.23382342125125047, "learning_rate": 9.556913674560733e-05, "loss": 1.9017, "step": 177 }, { "epoch": 0.05, "grad_norm": 0.19184570079482094, "learning_rate": 9.554367201426026e-05, "loss": 1.586, "step": 178 }, { "epoch": 0.05, "grad_norm": 0.23668167371557072, "learning_rate": 9.551820728291318e-05, "loss": 2.0936, "step": 179 }, { "epoch": 0.05, "grad_norm": 0.20898807379756024, "learning_rate": 9.549274255156608e-05, "loss": 1.8818, "step": 180 }, { "epoch": 0.05, "grad_norm": 0.2396599251528385, "learning_rate": 9.5467277820219e-05, "loss": 2.07, "step": 181 }, { "epoch": 0.05, "grad_norm": 0.20789926230727251, "learning_rate": 9.544181308887192e-05, "loss": 1.963, "step": 182 }, { "epoch": 0.05, "grad_norm": 0.26476631495390657, "learning_rate": 9.541634835752483e-05, "loss": 1.8445, "step": 183 }, { "epoch": 0.05, "grad_norm": 0.2273176941900955, "learning_rate": 9.539088362617775e-05, "loss": 1.9145, "step": 184 }, { "epoch": 0.05, "grad_norm": 0.23125805905191937, "learning_rate": 9.536541889483067e-05, "loss": 1.9857, "step": 185 }, { "epoch": 0.05, "grad_norm": 0.25186137813337384, "learning_rate": 9.533995416348358e-05, "loss": 2.0566, "step": 186 }, { "epoch": 0.05, "grad_norm": 0.30429619334858266, "learning_rate": 9.53144894321365e-05, "loss": 1.9218, "step": 187 }, { "epoch": 0.05, "grad_norm": 0.2087098395097207, "learning_rate": 9.52890247007894e-05, "loss": 2.0563, "step": 188 }, { "epoch": 0.05, "grad_norm": 0.210832825933067, "learning_rate": 9.526355996944233e-05, "loss": 1.6114, "step": 189 }, { "epoch": 0.05, "grad_norm": 0.26623973116973887, "learning_rate": 9.523809523809524e-05, "loss": 2.305, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.23251965012881762, "learning_rate": 9.521263050674815e-05, "loss": 2.0812, "step": 191 }, { "epoch": 0.05, "grad_norm": 0.23477609011259198, "learning_rate": 9.518716577540108e-05, "loss": 1.8897, "step": 192 }, { "epoch": 0.05, "grad_norm": 0.2121841537706127, "learning_rate": 9.516170104405399e-05, "loss": 1.8104, "step": 193 }, { "epoch": 0.05, "grad_norm": 0.21000993733508805, "learning_rate": 9.51362363127069e-05, "loss": 1.761, "step": 194 }, { "epoch": 0.05, "grad_norm": 0.2412983353771996, "learning_rate": 9.511077158135983e-05, "loss": 1.9648, "step": 195 }, { "epoch": 0.05, "grad_norm": 0.21809671979675835, "learning_rate": 9.508530685001274e-05, "loss": 1.9521, "step": 196 }, { "epoch": 0.05, "grad_norm": 0.2196701570803616, "learning_rate": 9.505984211866565e-05, "loss": 1.962, "step": 197 }, { "epoch": 0.05, "grad_norm": 0.20178135884970153, "learning_rate": 9.503437738731858e-05, "loss": 1.8271, "step": 198 }, { "epoch": 0.05, "grad_norm": 0.2165889430175184, "learning_rate": 9.500891265597148e-05, "loss": 1.8558, "step": 199 }, { "epoch": 0.05, "grad_norm": 0.21204500503733067, "learning_rate": 9.49834479246244e-05, "loss": 1.8737, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.2448504390910478, "learning_rate": 9.495798319327731e-05, "loss": 1.7742, "step": 201 }, { "epoch": 0.05, "grad_norm": 0.24421357400135402, "learning_rate": 9.493251846193023e-05, "loss": 2.0813, "step": 202 }, { "epoch": 0.05, "grad_norm": 0.24034686672484812, "learning_rate": 9.490705373058315e-05, "loss": 1.6847, "step": 203 }, { "epoch": 0.05, "grad_norm": 0.22251871110699203, "learning_rate": 9.488158899923606e-05, "loss": 2.0008, "step": 204 }, { "epoch": 0.05, "grad_norm": 0.20788696969656037, "learning_rate": 9.485612426788898e-05, "loss": 1.825, "step": 205 }, { "epoch": 0.05, "grad_norm": 0.24342274215879736, "learning_rate": 9.48306595365419e-05, "loss": 2.137, "step": 206 }, { "epoch": 0.05, "grad_norm": 0.21797141240999446, "learning_rate": 9.480519480519481e-05, "loss": 1.919, "step": 207 }, { "epoch": 0.05, "grad_norm": 0.22818316214645618, "learning_rate": 9.477973007384773e-05, "loss": 1.6708, "step": 208 }, { "epoch": 0.05, "grad_norm": 0.24104874289079398, "learning_rate": 9.475426534250064e-05, "loss": 1.8872, "step": 209 }, { "epoch": 0.05, "grad_norm": 0.27131811031770453, "learning_rate": 9.472880061115355e-05, "loss": 2.1951, "step": 210 }, { "epoch": 0.05, "grad_norm": 0.23612631701298523, "learning_rate": 9.470333587980648e-05, "loss": 1.8685, "step": 211 }, { "epoch": 0.05, "grad_norm": 0.25111361668915844, "learning_rate": 9.467787114845939e-05, "loss": 1.9868, "step": 212 }, { "epoch": 0.05, "grad_norm": 0.23129340711654747, "learning_rate": 9.46524064171123e-05, "loss": 1.8138, "step": 213 }, { "epoch": 0.05, "grad_norm": 0.23683270270522913, "learning_rate": 9.462694168576523e-05, "loss": 1.9155, "step": 214 }, { "epoch": 0.05, "grad_norm": 0.2231374860523671, "learning_rate": 9.460147695441814e-05, "loss": 1.6972, "step": 215 }, { "epoch": 0.05, "grad_norm": 0.21331695425466865, "learning_rate": 9.457601222307105e-05, "loss": 1.8439, "step": 216 }, { "epoch": 0.06, "grad_norm": 0.21181957054324804, "learning_rate": 9.455054749172396e-05, "loss": 1.7157, "step": 217 }, { "epoch": 0.06, "grad_norm": 0.229617283591777, "learning_rate": 9.452508276037689e-05, "loss": 2.1186, "step": 218 }, { "epoch": 0.06, "grad_norm": 0.19437555647276678, "learning_rate": 9.44996180290298e-05, "loss": 1.7184, "step": 219 }, { "epoch": 0.06, "grad_norm": 0.23963701987329625, "learning_rate": 9.447415329768271e-05, "loss": 1.8568, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.22261811649298022, "learning_rate": 9.444868856633562e-05, "loss": 1.6762, "step": 221 }, { "epoch": 0.06, "grad_norm": 0.22519402085899096, "learning_rate": 9.442322383498855e-05, "loss": 1.978, "step": 222 }, { "epoch": 0.06, "grad_norm": 0.26627643028672393, "learning_rate": 9.439775910364146e-05, "loss": 2.1728, "step": 223 }, { "epoch": 0.06, "grad_norm": 0.2309815472420271, "learning_rate": 9.437229437229437e-05, "loss": 1.9574, "step": 224 }, { "epoch": 0.06, "grad_norm": 0.23051474807252864, "learning_rate": 9.434682964094729e-05, "loss": 1.995, "step": 225 }, { "epoch": 0.06, "grad_norm": 0.30308261653684976, "learning_rate": 9.432136490960021e-05, "loss": 2.0055, "step": 226 }, { "epoch": 0.06, "grad_norm": 0.22150569759310887, "learning_rate": 9.429590017825312e-05, "loss": 1.8448, "step": 227 }, { "epoch": 0.06, "grad_norm": 0.23139463747136307, "learning_rate": 9.427043544690604e-05, "loss": 2.0128, "step": 228 }, { "epoch": 0.06, "grad_norm": 0.22266980180155999, "learning_rate": 9.424497071555896e-05, "loss": 1.8328, "step": 229 }, { "epoch": 0.06, "grad_norm": 0.22092674539443577, "learning_rate": 9.421950598421187e-05, "loss": 1.4841, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.20078682390780883, "learning_rate": 9.419404125286478e-05, "loss": 1.7392, "step": 231 }, { "epoch": 0.06, "grad_norm": 0.2692318787918702, "learning_rate": 9.416857652151771e-05, "loss": 1.8473, "step": 232 }, { "epoch": 0.06, "grad_norm": 0.22854627909015088, "learning_rate": 9.414311179017061e-05, "loss": 2.0171, "step": 233 }, { "epoch": 0.06, "grad_norm": 0.24774161651527243, "learning_rate": 9.411764705882353e-05, "loss": 1.9486, "step": 234 }, { "epoch": 0.06, "grad_norm": 0.24102782430332428, "learning_rate": 9.409218232747645e-05, "loss": 1.9989, "step": 235 }, { "epoch": 0.06, "grad_norm": 0.2010424021392836, "learning_rate": 9.406671759612936e-05, "loss": 1.7616, "step": 236 }, { "epoch": 0.06, "grad_norm": 0.23034735084682545, "learning_rate": 9.404125286478228e-05, "loss": 1.7533, "step": 237 }, { "epoch": 0.06, "grad_norm": 0.21964697350073992, "learning_rate": 9.40157881334352e-05, "loss": 1.7249, "step": 238 }, { "epoch": 0.06, "grad_norm": 0.2577106823861686, "learning_rate": 9.399032340208811e-05, "loss": 2.1802, "step": 239 }, { "epoch": 0.06, "grad_norm": 0.2224282134304929, "learning_rate": 9.396485867074103e-05, "loss": 1.7209, "step": 240 }, { "epoch": 0.06, "grad_norm": 0.25699890246231155, "learning_rate": 9.393939393939395e-05, "loss": 1.8835, "step": 241 }, { "epoch": 0.06, "grad_norm": 0.3005056604932864, "learning_rate": 9.391392920804686e-05, "loss": 2.058, "step": 242 }, { "epoch": 0.06, "grad_norm": 0.23921523980493628, "learning_rate": 9.388846447669978e-05, "loss": 1.8772, "step": 243 }, { "epoch": 0.06, "grad_norm": 0.2118256907628859, "learning_rate": 9.386299974535268e-05, "loss": 1.9704, "step": 244 }, { "epoch": 0.06, "grad_norm": 0.20004277814897858, "learning_rate": 9.383753501400561e-05, "loss": 1.6381, "step": 245 }, { "epoch": 0.06, "grad_norm": 0.23023427738670452, "learning_rate": 9.381207028265852e-05, "loss": 1.918, "step": 246 }, { "epoch": 0.06, "grad_norm": 0.22430066994725595, "learning_rate": 9.378660555131143e-05, "loss": 1.8846, "step": 247 }, { "epoch": 0.06, "grad_norm": 0.2543002666006967, "learning_rate": 9.376114081996436e-05, "loss": 2.1089, "step": 248 }, { "epoch": 0.06, "grad_norm": 0.2595002101959818, "learning_rate": 9.373567608861727e-05, "loss": 1.9058, "step": 249 }, { "epoch": 0.06, "grad_norm": 0.23391445529028587, "learning_rate": 9.371021135727018e-05, "loss": 1.964, "step": 250 }, { "epoch": 0.06, "grad_norm": 0.37687783156386134, "learning_rate": 9.368474662592311e-05, "loss": 1.8056, "step": 251 }, { "epoch": 0.06, "grad_norm": 0.25918034177581145, "learning_rate": 9.3659281894576e-05, "loss": 2.3148, "step": 252 }, { "epoch": 0.06, "grad_norm": 0.24306255351316375, "learning_rate": 9.363381716322893e-05, "loss": 1.9831, "step": 253 }, { "epoch": 0.06, "grad_norm": 0.2575646567417516, "learning_rate": 9.360835243188186e-05, "loss": 1.8934, "step": 254 }, { "epoch": 0.06, "grad_norm": 0.23062400114675347, "learning_rate": 9.358288770053476e-05, "loss": 2.0416, "step": 255 }, { "epoch": 0.07, "grad_norm": 0.24386803477315933, "learning_rate": 9.355742296918768e-05, "loss": 1.7916, "step": 256 }, { "epoch": 0.07, "grad_norm": 0.21974957663492034, "learning_rate": 9.35319582378406e-05, "loss": 1.9653, "step": 257 }, { "epoch": 0.07, "grad_norm": 0.21492461503340388, "learning_rate": 9.35064935064935e-05, "loss": 1.7282, "step": 258 }, { "epoch": 0.07, "grad_norm": 0.22698493741125245, "learning_rate": 9.348102877514643e-05, "loss": 1.8432, "step": 259 }, { "epoch": 0.07, "grad_norm": 0.20285954001886503, "learning_rate": 9.345556404379934e-05, "loss": 1.7529, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.22456986452974662, "learning_rate": 9.343009931245226e-05, "loss": 1.8794, "step": 261 }, { "epoch": 0.07, "grad_norm": 0.22394667177371683, "learning_rate": 9.340463458110518e-05, "loss": 1.8886, "step": 262 }, { "epoch": 0.07, "grad_norm": 0.22092161082335537, "learning_rate": 9.337916984975808e-05, "loss": 2.0836, "step": 263 }, { "epoch": 0.07, "grad_norm": 0.20578946766995015, "learning_rate": 9.3353705118411e-05, "loss": 1.5635, "step": 264 }, { "epoch": 0.07, "grad_norm": 0.20675349046696734, "learning_rate": 9.332824038706393e-05, "loss": 1.9682, "step": 265 }, { "epoch": 0.07, "grad_norm": 0.23204721980255197, "learning_rate": 9.330277565571683e-05, "loss": 2.1915, "step": 266 }, { "epoch": 0.07, "grad_norm": 0.2634111457529946, "learning_rate": 9.327731092436976e-05, "loss": 1.7796, "step": 267 }, { "epoch": 0.07, "grad_norm": 0.2743533600157014, "learning_rate": 9.325184619302267e-05, "loss": 1.6329, "step": 268 }, { "epoch": 0.07, "grad_norm": 0.2420317702906553, "learning_rate": 9.322638146167558e-05, "loss": 1.6665, "step": 269 }, { "epoch": 0.07, "grad_norm": 0.19690034112871688, "learning_rate": 9.32009167303285e-05, "loss": 1.8188, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.2416787341135894, "learning_rate": 9.317545199898142e-05, "loss": 1.9407, "step": 271 }, { "epoch": 0.07, "grad_norm": 0.21190598468063523, "learning_rate": 9.314998726763433e-05, "loss": 1.8868, "step": 272 }, { "epoch": 0.07, "grad_norm": 0.24950943440028028, "learning_rate": 9.312452253628725e-05, "loss": 1.9476, "step": 273 }, { "epoch": 0.07, "grad_norm": 0.21485197745088533, "learning_rate": 9.309905780494015e-05, "loss": 1.6875, "step": 274 }, { "epoch": 0.07, "grad_norm": 0.2398399666478477, "learning_rate": 9.307359307359308e-05, "loss": 1.8274, "step": 275 }, { "epoch": 0.07, "grad_norm": 0.24735280047478256, "learning_rate": 9.3048128342246e-05, "loss": 2.009, "step": 276 }, { "epoch": 0.07, "grad_norm": 0.2631694952704255, "learning_rate": 9.30226636108989e-05, "loss": 1.97, "step": 277 }, { "epoch": 0.07, "grad_norm": 0.19990147682229312, "learning_rate": 9.299719887955183e-05, "loss": 1.5918, "step": 278 }, { "epoch": 0.07, "grad_norm": 0.19055219768739767, "learning_rate": 9.297173414820474e-05, "loss": 1.5943, "step": 279 }, { "epoch": 0.07, "grad_norm": 0.23155827723062083, "learning_rate": 9.294626941685765e-05, "loss": 1.7937, "step": 280 }, { "epoch": 0.07, "grad_norm": 0.2576038643469748, "learning_rate": 9.292080468551058e-05, "loss": 2.0919, "step": 281 }, { "epoch": 0.07, "grad_norm": 0.22399345203923643, "learning_rate": 9.289533995416349e-05, "loss": 1.6301, "step": 282 }, { "epoch": 0.07, "grad_norm": 0.22107518323709327, "learning_rate": 9.28698752228164e-05, "loss": 1.6966, "step": 283 }, { "epoch": 0.07, "grad_norm": 0.2091183455822762, "learning_rate": 9.284441049146933e-05, "loss": 1.8305, "step": 284 }, { "epoch": 0.07, "grad_norm": 0.21341806222699425, "learning_rate": 9.281894576012223e-05, "loss": 1.7999, "step": 285 }, { "epoch": 0.07, "grad_norm": 0.2196306400731007, "learning_rate": 9.279348102877515e-05, "loss": 1.9407, "step": 286 }, { "epoch": 0.07, "grad_norm": 0.24222282951836133, "learning_rate": 9.276801629742806e-05, "loss": 1.7496, "step": 287 }, { "epoch": 0.07, "grad_norm": 0.2120531741834089, "learning_rate": 9.274255156608098e-05, "loss": 1.4856, "step": 288 }, { "epoch": 0.07, "grad_norm": 0.25255465595815, "learning_rate": 9.27170868347339e-05, "loss": 1.9091, "step": 289 }, { "epoch": 0.07, "grad_norm": 0.23932122149332355, "learning_rate": 9.269162210338681e-05, "loss": 1.9319, "step": 290 }, { "epoch": 0.07, "grad_norm": 0.2251063529297804, "learning_rate": 9.266615737203973e-05, "loss": 1.8085, "step": 291 }, { "epoch": 0.07, "grad_norm": 0.24473398201937743, "learning_rate": 9.264069264069265e-05, "loss": 2.005, "step": 292 }, { "epoch": 0.07, "grad_norm": 0.2465717275751802, "learning_rate": 9.261522790934556e-05, "loss": 2.1889, "step": 293 }, { "epoch": 0.07, "grad_norm": 0.2842685521193732, "learning_rate": 9.258976317799848e-05, "loss": 1.9895, "step": 294 }, { "epoch": 0.08, "grad_norm": 0.2375850983047579, "learning_rate": 9.256429844665139e-05, "loss": 1.8794, "step": 295 }, { "epoch": 0.08, "grad_norm": 0.22515910503528352, "learning_rate": 9.253883371530431e-05, "loss": 1.8471, "step": 296 }, { "epoch": 0.08, "grad_norm": 0.20256419894910435, "learning_rate": 9.251336898395723e-05, "loss": 1.9121, "step": 297 }, { "epoch": 0.08, "grad_norm": 0.19414163585376648, "learning_rate": 9.248790425261014e-05, "loss": 1.754, "step": 298 }, { "epoch": 0.08, "grad_norm": 0.2590600520249542, "learning_rate": 9.246243952126305e-05, "loss": 1.9198, "step": 299 }, { "epoch": 0.08, "grad_norm": 0.2186561110505484, "learning_rate": 9.243697478991598e-05, "loss": 1.8622, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.2325676052442486, "learning_rate": 9.241151005856889e-05, "loss": 2.038, "step": 301 }, { "epoch": 0.08, "grad_norm": 0.3049684598514628, "learning_rate": 9.23860453272218e-05, "loss": 2.2992, "step": 302 }, { "epoch": 0.08, "grad_norm": 0.2632339372769938, "learning_rate": 9.236058059587471e-05, "loss": 1.9476, "step": 303 }, { "epoch": 0.08, "grad_norm": 0.21988150326001385, "learning_rate": 9.233511586452764e-05, "loss": 1.8327, "step": 304 }, { "epoch": 0.08, "grad_norm": 0.2642056043525351, "learning_rate": 9.230965113318055e-05, "loss": 1.8121, "step": 305 }, { "epoch": 0.08, "grad_norm": 0.2752051032919026, "learning_rate": 9.228418640183346e-05, "loss": 1.9429, "step": 306 }, { "epoch": 0.08, "grad_norm": 0.23849414476318845, "learning_rate": 9.225872167048639e-05, "loss": 1.858, "step": 307 }, { "epoch": 0.08, "grad_norm": 0.264730461894566, "learning_rate": 9.22332569391393e-05, "loss": 2.2006, "step": 308 }, { "epoch": 0.08, "grad_norm": 0.23811792028971387, "learning_rate": 9.220779220779221e-05, "loss": 2.0294, "step": 309 }, { "epoch": 0.08, "grad_norm": 0.24963177444652385, "learning_rate": 9.218232747644512e-05, "loss": 1.8812, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.21743851296807606, "learning_rate": 9.215686274509804e-05, "loss": 1.8752, "step": 311 }, { "epoch": 0.08, "grad_norm": 0.207702098647325, "learning_rate": 9.213139801375096e-05, "loss": 1.8431, "step": 312 }, { "epoch": 0.08, "grad_norm": 0.2536897469848907, "learning_rate": 9.210593328240387e-05, "loss": 2.344, "step": 313 }, { "epoch": 0.08, "grad_norm": 0.2527882537720398, "learning_rate": 9.208046855105679e-05, "loss": 2.1259, "step": 314 }, { "epoch": 0.08, "grad_norm": 0.22217569861745942, "learning_rate": 9.205500381970971e-05, "loss": 1.9742, "step": 315 }, { "epoch": 0.08, "grad_norm": 0.23149605689696912, "learning_rate": 9.202953908836262e-05, "loss": 1.8699, "step": 316 }, { "epoch": 0.08, "grad_norm": 0.2558552399512293, "learning_rate": 9.200407435701554e-05, "loss": 2.0121, "step": 317 }, { "epoch": 0.08, "grad_norm": 0.2815150495507308, "learning_rate": 9.197860962566846e-05, "loss": 1.8827, "step": 318 }, { "epoch": 0.08, "grad_norm": 0.24109724971829488, "learning_rate": 9.195314489432137e-05, "loss": 2.034, "step": 319 }, { "epoch": 0.08, "grad_norm": 0.21261453662280155, "learning_rate": 9.192768016297429e-05, "loss": 1.5434, "step": 320 }, { "epoch": 0.08, "grad_norm": 0.23790910458001532, "learning_rate": 9.19022154316272e-05, "loss": 1.9696, "step": 321 }, { "epoch": 0.08, "grad_norm": 0.21656851727106363, "learning_rate": 9.187675070028011e-05, "loss": 1.7906, "step": 322 }, { "epoch": 0.08, "grad_norm": 0.24685226589261666, "learning_rate": 9.185128596893303e-05, "loss": 2.0424, "step": 323 }, { "epoch": 0.08, "grad_norm": 0.22259924811476584, "learning_rate": 9.182582123758595e-05, "loss": 1.8244, "step": 324 }, { "epoch": 0.08, "grad_norm": 0.2196586079950527, "learning_rate": 9.180035650623886e-05, "loss": 1.8852, "step": 325 }, { "epoch": 0.08, "grad_norm": 0.24882911130988142, "learning_rate": 9.177489177489178e-05, "loss": 2.1478, "step": 326 }, { "epoch": 0.08, "grad_norm": 0.26072995732231113, "learning_rate": 9.17494270435447e-05, "loss": 1.7825, "step": 327 }, { "epoch": 0.08, "grad_norm": 0.22901788762404968, "learning_rate": 9.172396231219761e-05, "loss": 2.0263, "step": 328 }, { "epoch": 0.08, "grad_norm": 0.3027309786698346, "learning_rate": 9.169849758085053e-05, "loss": 1.6272, "step": 329 }, { "epoch": 0.08, "grad_norm": 0.2426624716595467, "learning_rate": 9.167303284950343e-05, "loss": 1.8666, "step": 330 }, { "epoch": 0.08, "grad_norm": 0.30318419969720933, "learning_rate": 9.164756811815636e-05, "loss": 1.8957, "step": 331 }, { "epoch": 0.08, "grad_norm": 0.2620787351070465, "learning_rate": 9.162210338680927e-05, "loss": 1.9753, "step": 332 }, { "epoch": 0.08, "grad_norm": 0.23925396663143897, "learning_rate": 9.159663865546218e-05, "loss": 1.8427, "step": 333 }, { "epoch": 0.09, "grad_norm": 0.25543502386679007, "learning_rate": 9.157117392411511e-05, "loss": 2.0556, "step": 334 }, { "epoch": 0.09, "grad_norm": 0.25514597844638615, "learning_rate": 9.154570919276802e-05, "loss": 1.8451, "step": 335 }, { "epoch": 0.09, "grad_norm": 0.2341678180502237, "learning_rate": 9.152024446142093e-05, "loss": 1.9502, "step": 336 }, { "epoch": 0.09, "grad_norm": 0.2529748559630748, "learning_rate": 9.149477973007386e-05, "loss": 1.9665, "step": 337 }, { "epoch": 0.09, "grad_norm": 0.26268342852744336, "learning_rate": 9.146931499872676e-05, "loss": 2.045, "step": 338 }, { "epoch": 0.09, "grad_norm": 0.22052807744368427, "learning_rate": 9.144385026737968e-05, "loss": 1.8048, "step": 339 }, { "epoch": 0.09, "grad_norm": 0.2763289570061149, "learning_rate": 9.141838553603261e-05, "loss": 2.1017, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.3086075236469837, "learning_rate": 9.13929208046855e-05, "loss": 1.8693, "step": 341 }, { "epoch": 0.09, "grad_norm": 0.24003803945126273, "learning_rate": 9.136745607333843e-05, "loss": 1.8391, "step": 342 }, { "epoch": 0.09, "grad_norm": 0.23133864540209823, "learning_rate": 9.134199134199136e-05, "loss": 1.9012, "step": 343 }, { "epoch": 0.09, "grad_norm": 0.2897567659629809, "learning_rate": 9.131652661064426e-05, "loss": 1.831, "step": 344 }, { "epoch": 0.09, "grad_norm": 0.22979690936728908, "learning_rate": 9.129106187929718e-05, "loss": 1.7455, "step": 345 }, { "epoch": 0.09, "grad_norm": 0.24403021795538454, "learning_rate": 9.12655971479501e-05, "loss": 1.7353, "step": 346 }, { "epoch": 0.09, "grad_norm": 0.21211903411341312, "learning_rate": 9.1240132416603e-05, "loss": 1.9091, "step": 347 }, { "epoch": 0.09, "grad_norm": 0.2465935581647895, "learning_rate": 9.121466768525593e-05, "loss": 1.9653, "step": 348 }, { "epoch": 0.09, "grad_norm": 0.22337100266582047, "learning_rate": 9.118920295390883e-05, "loss": 1.9857, "step": 349 }, { "epoch": 0.09, "grad_norm": 0.19762650141546803, "learning_rate": 9.116373822256176e-05, "loss": 1.6218, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.27535143363690945, "learning_rate": 9.113827349121468e-05, "loss": 1.8376, "step": 351 }, { "epoch": 0.09, "grad_norm": 0.24252525742486392, "learning_rate": 9.111280875986758e-05, "loss": 1.8176, "step": 352 }, { "epoch": 0.09, "grad_norm": 0.23695886790494372, "learning_rate": 9.10873440285205e-05, "loss": 1.7958, "step": 353 }, { "epoch": 0.09, "grad_norm": 0.21746293413915188, "learning_rate": 9.106187929717343e-05, "loss": 1.667, "step": 354 }, { "epoch": 0.09, "grad_norm": 0.3088650054487904, "learning_rate": 9.103641456582633e-05, "loss": 1.9392, "step": 355 }, { "epoch": 0.09, "grad_norm": 0.1965171096562478, "learning_rate": 9.101094983447926e-05, "loss": 1.7628, "step": 356 }, { "epoch": 0.09, "grad_norm": 0.2238052486293197, "learning_rate": 9.098548510313217e-05, "loss": 1.8061, "step": 357 }, { "epoch": 0.09, "grad_norm": 0.23181591059915962, "learning_rate": 9.096002037178508e-05, "loss": 1.9798, "step": 358 }, { "epoch": 0.09, "grad_norm": 0.2392610666481906, "learning_rate": 9.0934555640438e-05, "loss": 1.9227, "step": 359 }, { "epoch": 0.09, "grad_norm": 0.250920285695026, "learning_rate": 9.090909090909092e-05, "loss": 2.0199, "step": 360 }, { "epoch": 0.09, "grad_norm": 0.20542773464715974, "learning_rate": 9.088362617774383e-05, "loss": 2.0731, "step": 361 }, { "epoch": 0.09, "grad_norm": 0.19969186515394957, "learning_rate": 9.085816144639675e-05, "loss": 1.8649, "step": 362 }, { "epoch": 0.09, "grad_norm": 0.21939794762728568, "learning_rate": 9.083269671504965e-05, "loss": 1.9547, "step": 363 }, { "epoch": 0.09, "grad_norm": 0.18498324065396723, "learning_rate": 9.080723198370258e-05, "loss": 1.6511, "step": 364 }, { "epoch": 0.09, "grad_norm": 0.2395442415908442, "learning_rate": 9.078176725235549e-05, "loss": 2.0866, "step": 365 }, { "epoch": 0.09, "grad_norm": 0.2867061222424313, "learning_rate": 9.07563025210084e-05, "loss": 2.1337, "step": 366 }, { "epoch": 0.09, "grad_norm": 0.24312892338202968, "learning_rate": 9.073083778966133e-05, "loss": 2.0018, "step": 367 }, { "epoch": 0.09, "grad_norm": 0.21800031946097598, "learning_rate": 9.070537305831424e-05, "loss": 1.7939, "step": 368 }, { "epoch": 0.09, "grad_norm": 0.22455248734256647, "learning_rate": 9.067990832696715e-05, "loss": 1.8707, "step": 369 }, { "epoch": 0.09, "grad_norm": 0.19183669055751298, "learning_rate": 9.065444359562008e-05, "loss": 1.7357, "step": 370 }, { "epoch": 0.09, "grad_norm": 0.22691866404601918, "learning_rate": 9.062897886427299e-05, "loss": 1.7986, "step": 371 }, { "epoch": 0.09, "grad_norm": 0.30994248506537475, "learning_rate": 9.06035141329259e-05, "loss": 2.1003, "step": 372 }, { "epoch": 0.09, "grad_norm": 0.22813953523184213, "learning_rate": 9.057804940157881e-05, "loss": 1.9194, "step": 373 }, { "epoch": 0.1, "grad_norm": 0.23784922692149396, "learning_rate": 9.055258467023173e-05, "loss": 2.0995, "step": 374 }, { "epoch": 0.1, "grad_norm": 0.25676292571039067, "learning_rate": 9.052711993888465e-05, "loss": 2.1612, "step": 375 }, { "epoch": 0.1, "grad_norm": 0.25134486326781896, "learning_rate": 9.050165520753756e-05, "loss": 2.0511, "step": 376 }, { "epoch": 0.1, "grad_norm": 0.24854067963044174, "learning_rate": 9.047619047619048e-05, "loss": 1.9804, "step": 377 }, { "epoch": 0.1, "grad_norm": 0.23139343807085166, "learning_rate": 9.04507257448434e-05, "loss": 1.5953, "step": 378 }, { "epoch": 0.1, "grad_norm": 0.19988237438709283, "learning_rate": 9.042526101349631e-05, "loss": 1.7571, "step": 379 }, { "epoch": 0.1, "grad_norm": 0.23942742077516901, "learning_rate": 9.039979628214923e-05, "loss": 2.0776, "step": 380 }, { "epoch": 0.1, "grad_norm": 0.2313486291648339, "learning_rate": 9.037433155080214e-05, "loss": 1.6489, "step": 381 }, { "epoch": 0.1, "grad_norm": 0.21412353306850734, "learning_rate": 9.034886681945506e-05, "loss": 1.7505, "step": 382 }, { "epoch": 0.1, "grad_norm": 0.2854456804430116, "learning_rate": 9.032340208810798e-05, "loss": 2.1109, "step": 383 }, { "epoch": 0.1, "grad_norm": 0.253768766301195, "learning_rate": 9.029793735676089e-05, "loss": 1.8257, "step": 384 }, { "epoch": 0.1, "grad_norm": 0.2703400790748331, "learning_rate": 9.02724726254138e-05, "loss": 1.9159, "step": 385 }, { "epoch": 0.1, "grad_norm": 0.24359989557350126, "learning_rate": 9.024700789406673e-05, "loss": 1.9214, "step": 386 }, { "epoch": 0.1, "grad_norm": 0.25899432696845703, "learning_rate": 9.022154316271964e-05, "loss": 1.9816, "step": 387 }, { "epoch": 0.1, "grad_norm": 0.3254011608126227, "learning_rate": 9.019607843137255e-05, "loss": 1.7581, "step": 388 }, { "epoch": 0.1, "grad_norm": 0.2031342889422471, "learning_rate": 9.017061370002546e-05, "loss": 1.7662, "step": 389 }, { "epoch": 0.1, "grad_norm": 0.3026088700555451, "learning_rate": 9.014514896867839e-05, "loss": 2.2481, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.21346682694721678, "learning_rate": 9.01196842373313e-05, "loss": 1.9334, "step": 391 }, { "epoch": 0.1, "grad_norm": 0.25777758737626477, "learning_rate": 9.009421950598421e-05, "loss": 2.2532, "step": 392 }, { "epoch": 0.1, "grad_norm": 0.241481143173268, "learning_rate": 9.006875477463714e-05, "loss": 1.8002, "step": 393 }, { "epoch": 0.1, "grad_norm": 0.20162168232777278, "learning_rate": 9.004329004329005e-05, "loss": 1.8179, "step": 394 }, { "epoch": 0.1, "grad_norm": 0.24928963588818537, "learning_rate": 9.001782531194296e-05, "loss": 1.9853, "step": 395 }, { "epoch": 0.1, "grad_norm": 0.2562772090934246, "learning_rate": 8.999236058059587e-05, "loss": 1.991, "step": 396 }, { "epoch": 0.1, "grad_norm": 0.265292759285601, "learning_rate": 8.99668958492488e-05, "loss": 1.777, "step": 397 }, { "epoch": 0.1, "grad_norm": 0.22762612081280048, "learning_rate": 8.994143111790171e-05, "loss": 1.8467, "step": 398 }, { "epoch": 0.1, "grad_norm": 0.2363791292496596, "learning_rate": 8.991596638655462e-05, "loss": 1.7783, "step": 399 }, { "epoch": 0.1, "grad_norm": 0.24695737617258487, "learning_rate": 8.989050165520754e-05, "loss": 1.8617, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.25795499158430346, "learning_rate": 8.986503692386046e-05, "loss": 2.1168, "step": 401 }, { "epoch": 0.1, "grad_norm": 0.2367588799205056, "learning_rate": 8.983957219251337e-05, "loss": 1.686, "step": 402 }, { "epoch": 0.1, "grad_norm": 0.2486136187076788, "learning_rate": 8.981410746116629e-05, "loss": 1.6311, "step": 403 }, { "epoch": 0.1, "grad_norm": 0.23212428234792606, "learning_rate": 8.978864272981921e-05, "loss": 1.8845, "step": 404 }, { "epoch": 0.1, "grad_norm": 0.21297822252507279, "learning_rate": 8.976317799847212e-05, "loss": 1.6802, "step": 405 }, { "epoch": 0.1, "grad_norm": 0.22666768394766693, "learning_rate": 8.973771326712504e-05, "loss": 1.8839, "step": 406 }, { "epoch": 0.1, "grad_norm": 0.22721848395946773, "learning_rate": 8.971224853577795e-05, "loss": 1.7933, "step": 407 }, { "epoch": 0.1, "grad_norm": 0.23387883437850063, "learning_rate": 8.968678380443086e-05, "loss": 1.8834, "step": 408 }, { "epoch": 0.1, "grad_norm": 0.25587524481325125, "learning_rate": 8.966131907308379e-05, "loss": 1.9019, "step": 409 }, { "epoch": 0.1, "grad_norm": 0.22077238633383342, "learning_rate": 8.96358543417367e-05, "loss": 1.9587, "step": 410 }, { "epoch": 0.1, "grad_norm": 0.269133675406636, "learning_rate": 8.961038961038961e-05, "loss": 1.8813, "step": 411 }, { "epoch": 0.1, "grad_norm": 0.23159155269564086, "learning_rate": 8.958492487904253e-05, "loss": 2.0881, "step": 412 }, { "epoch": 0.11, "grad_norm": 0.28728695003647925, "learning_rate": 8.955946014769545e-05, "loss": 2.0738, "step": 413 }, { "epoch": 0.11, "grad_norm": 0.22292837445688052, "learning_rate": 8.953399541634836e-05, "loss": 1.7848, "step": 414 }, { "epoch": 0.11, "grad_norm": 0.25611957394245294, "learning_rate": 8.950853068500128e-05, "loss": 1.9309, "step": 415 }, { "epoch": 0.11, "grad_norm": 0.2808692394535083, "learning_rate": 8.948306595365418e-05, "loss": 2.0691, "step": 416 }, { "epoch": 0.11, "grad_norm": 0.24805891203021524, "learning_rate": 8.945760122230711e-05, "loss": 1.8176, "step": 417 }, { "epoch": 0.11, "grad_norm": 0.22788913957793705, "learning_rate": 8.943213649096003e-05, "loss": 2.0164, "step": 418 }, { "epoch": 0.11, "grad_norm": 0.28556400044197944, "learning_rate": 8.940667175961293e-05, "loss": 2.1959, "step": 419 }, { "epoch": 0.11, "grad_norm": 0.2865977497114976, "learning_rate": 8.938120702826586e-05, "loss": 1.9149, "step": 420 }, { "epoch": 0.11, "grad_norm": 0.23410808146142997, "learning_rate": 8.935574229691877e-05, "loss": 1.8804, "step": 421 }, { "epoch": 0.11, "grad_norm": 0.2320700474936451, "learning_rate": 8.933027756557168e-05, "loss": 1.6851, "step": 422 }, { "epoch": 0.11, "grad_norm": 0.23821295869621667, "learning_rate": 8.930481283422461e-05, "loss": 2.059, "step": 423 }, { "epoch": 0.11, "grad_norm": 0.22035404888955523, "learning_rate": 8.927934810287751e-05, "loss": 2.0662, "step": 424 }, { "epoch": 0.11, "grad_norm": 0.25753879467837276, "learning_rate": 8.925388337153043e-05, "loss": 1.7946, "step": 425 }, { "epoch": 0.11, "grad_norm": 0.23174398602498045, "learning_rate": 8.922841864018336e-05, "loss": 1.9452, "step": 426 }, { "epoch": 0.11, "grad_norm": 0.21423272026802384, "learning_rate": 8.920295390883626e-05, "loss": 1.8668, "step": 427 }, { "epoch": 0.11, "grad_norm": 0.23524905778157035, "learning_rate": 8.917748917748918e-05, "loss": 1.9177, "step": 428 }, { "epoch": 0.11, "grad_norm": 0.22128603765671084, "learning_rate": 8.915202444614211e-05, "loss": 1.6931, "step": 429 }, { "epoch": 0.11, "grad_norm": 0.30417705228572045, "learning_rate": 8.9126559714795e-05, "loss": 1.7556, "step": 430 }, { "epoch": 0.11, "grad_norm": 0.22680455464532032, "learning_rate": 8.910109498344793e-05, "loss": 1.8978, "step": 431 }, { "epoch": 0.11, "grad_norm": 0.22491518846108297, "learning_rate": 8.907563025210084e-05, "loss": 1.8708, "step": 432 }, { "epoch": 0.11, "grad_norm": 0.24514940513649147, "learning_rate": 8.905016552075376e-05, "loss": 2.0945, "step": 433 }, { "epoch": 0.11, "grad_norm": 0.3211911292812862, "learning_rate": 8.902470078940668e-05, "loss": 2.1676, "step": 434 }, { "epoch": 0.11, "grad_norm": 0.21672812222249246, "learning_rate": 8.89992360580596e-05, "loss": 1.9561, "step": 435 }, { "epoch": 0.11, "grad_norm": 0.24722686150361434, "learning_rate": 8.89737713267125e-05, "loss": 2.0444, "step": 436 }, { "epoch": 0.11, "grad_norm": 0.2407730235551704, "learning_rate": 8.894830659536543e-05, "loss": 2.0372, "step": 437 }, { "epoch": 0.11, "grad_norm": 0.21103154840876165, "learning_rate": 8.892284186401833e-05, "loss": 1.786, "step": 438 }, { "epoch": 0.11, "grad_norm": 0.2513228969486743, "learning_rate": 8.889737713267126e-05, "loss": 1.9824, "step": 439 }, { "epoch": 0.11, "grad_norm": 0.258544590475214, "learning_rate": 8.887191240132418e-05, "loss": 1.972, "step": 440 }, { "epoch": 0.11, "grad_norm": 0.22888813467895552, "learning_rate": 8.884644766997708e-05, "loss": 2.0106, "step": 441 }, { "epoch": 0.11, "grad_norm": 0.21185224484030166, "learning_rate": 8.882098293863e-05, "loss": 1.6985, "step": 442 }, { "epoch": 0.11, "grad_norm": 0.2505140384273364, "learning_rate": 8.879551820728292e-05, "loss": 1.7832, "step": 443 }, { "epoch": 0.11, "grad_norm": 0.2634044704116914, "learning_rate": 8.877005347593583e-05, "loss": 1.9525, "step": 444 }, { "epoch": 0.11, "grad_norm": 0.26359759096740165, "learning_rate": 8.874458874458876e-05, "loss": 1.8602, "step": 445 }, { "epoch": 0.11, "grad_norm": 0.22000572273007202, "learning_rate": 8.871912401324167e-05, "loss": 1.6984, "step": 446 }, { "epoch": 0.11, "grad_norm": 0.26066085175478015, "learning_rate": 8.869365928189458e-05, "loss": 2.0568, "step": 447 }, { "epoch": 0.11, "grad_norm": 0.2773800812686037, "learning_rate": 8.86681945505475e-05, "loss": 1.9775, "step": 448 }, { "epoch": 0.11, "grad_norm": 0.22358819774715658, "learning_rate": 8.86427298192004e-05, "loss": 1.8704, "step": 449 }, { "epoch": 0.11, "grad_norm": 0.3077196828448956, "learning_rate": 8.861726508785333e-05, "loss": 2.1512, "step": 450 }, { "epoch": 0.11, "grad_norm": 0.2786371669431707, "learning_rate": 8.859180035650624e-05, "loss": 2.0763, "step": 451 }, { "epoch": 0.12, "grad_norm": 0.22067992333181366, "learning_rate": 8.856633562515915e-05, "loss": 1.7935, "step": 452 }, { "epoch": 0.12, "grad_norm": 0.23456282242402895, "learning_rate": 8.854087089381208e-05, "loss": 1.8671, "step": 453 }, { "epoch": 0.12, "grad_norm": 0.26017027006661725, "learning_rate": 8.851540616246499e-05, "loss": 1.5871, "step": 454 }, { "epoch": 0.12, "grad_norm": 0.2605339390232834, "learning_rate": 8.84899414311179e-05, "loss": 2.0417, "step": 455 }, { "epoch": 0.12, "grad_norm": 0.2249779620901503, "learning_rate": 8.846447669977083e-05, "loss": 1.8832, "step": 456 }, { "epoch": 0.12, "grad_norm": 0.23915839952016046, "learning_rate": 8.843901196842374e-05, "loss": 1.8563, "step": 457 }, { "epoch": 0.12, "grad_norm": 0.2620073406978356, "learning_rate": 8.841354723707665e-05, "loss": 1.9188, "step": 458 }, { "epoch": 0.12, "grad_norm": 0.21269945433115764, "learning_rate": 8.838808250572957e-05, "loss": 1.5572, "step": 459 }, { "epoch": 0.12, "grad_norm": 0.28633311164541697, "learning_rate": 8.836261777438248e-05, "loss": 2.0076, "step": 460 }, { "epoch": 0.12, "grad_norm": 0.2578739179800228, "learning_rate": 8.83371530430354e-05, "loss": 2.0649, "step": 461 }, { "epoch": 0.12, "grad_norm": 0.23703436505578596, "learning_rate": 8.831168831168831e-05, "loss": 1.8394, "step": 462 }, { "epoch": 0.12, "grad_norm": 0.27268398548268413, "learning_rate": 8.828622358034123e-05, "loss": 2.0113, "step": 463 }, { "epoch": 0.12, "grad_norm": 0.22900004271804628, "learning_rate": 8.826075884899415e-05, "loss": 1.8828, "step": 464 }, { "epoch": 0.12, "grad_norm": 0.2886673959221963, "learning_rate": 8.823529411764706e-05, "loss": 1.8799, "step": 465 }, { "epoch": 0.12, "grad_norm": 0.2320470672893286, "learning_rate": 8.820982938629998e-05, "loss": 1.9301, "step": 466 }, { "epoch": 0.12, "grad_norm": 0.26738534373052875, "learning_rate": 8.818436465495289e-05, "loss": 2.0133, "step": 467 }, { "epoch": 0.12, "grad_norm": 0.25844804859889603, "learning_rate": 8.815889992360581e-05, "loss": 1.9833, "step": 468 }, { "epoch": 0.12, "grad_norm": 0.25046136127694163, "learning_rate": 8.813343519225873e-05, "loss": 1.7024, "step": 469 }, { "epoch": 0.12, "grad_norm": 0.2430617795253286, "learning_rate": 8.810797046091164e-05, "loss": 1.801, "step": 470 }, { "epoch": 0.12, "grad_norm": 0.2419599648636819, "learning_rate": 8.808250572956455e-05, "loss": 1.8624, "step": 471 }, { "epoch": 0.12, "grad_norm": 0.2659463341273293, "learning_rate": 8.805704099821748e-05, "loss": 2.0745, "step": 472 }, { "epoch": 0.12, "grad_norm": 0.24013448980226984, "learning_rate": 8.803157626687039e-05, "loss": 1.7037, "step": 473 }, { "epoch": 0.12, "grad_norm": 0.23022342815075916, "learning_rate": 8.80061115355233e-05, "loss": 1.8988, "step": 474 }, { "epoch": 0.12, "grad_norm": 0.2367004267836987, "learning_rate": 8.798064680417623e-05, "loss": 1.8279, "step": 475 }, { "epoch": 0.12, "grad_norm": 0.2620297477283304, "learning_rate": 8.795518207282914e-05, "loss": 1.8255, "step": 476 }, { "epoch": 0.12, "grad_norm": 0.2547679218327907, "learning_rate": 8.792971734148205e-05, "loss": 1.9405, "step": 477 }, { "epoch": 0.12, "grad_norm": 0.24895355889731416, "learning_rate": 8.790425261013496e-05, "loss": 1.9529, "step": 478 }, { "epoch": 0.12, "grad_norm": 0.28745180586441543, "learning_rate": 8.787878787878789e-05, "loss": 1.8011, "step": 479 }, { "epoch": 0.12, "grad_norm": 0.2878862108904096, "learning_rate": 8.78533231474408e-05, "loss": 2.0354, "step": 480 }, { "epoch": 0.12, "grad_norm": 0.25603030952198, "learning_rate": 8.782785841609371e-05, "loss": 2.0772, "step": 481 }, { "epoch": 0.12, "grad_norm": 0.23827494941261018, "learning_rate": 8.780239368474664e-05, "loss": 2.2084, "step": 482 }, { "epoch": 0.12, "grad_norm": 0.20837960126816166, "learning_rate": 8.777692895339955e-05, "loss": 1.5153, "step": 483 }, { "epoch": 0.12, "grad_norm": 0.20977901295253784, "learning_rate": 8.775146422205246e-05, "loss": 1.953, "step": 484 }, { "epoch": 0.12, "grad_norm": 0.2803722831995815, "learning_rate": 8.772599949070537e-05, "loss": 2.0062, "step": 485 }, { "epoch": 0.12, "grad_norm": 0.31592634092672583, "learning_rate": 8.770053475935829e-05, "loss": 1.9259, "step": 486 }, { "epoch": 0.12, "grad_norm": 0.20853917661705262, "learning_rate": 8.767507002801121e-05, "loss": 1.7843, "step": 487 }, { "epoch": 0.12, "grad_norm": 0.2622256707469382, "learning_rate": 8.764960529666412e-05, "loss": 1.6427, "step": 488 }, { "epoch": 0.12, "grad_norm": 0.26821743683336324, "learning_rate": 8.762414056531704e-05, "loss": 1.797, "step": 489 }, { "epoch": 0.12, "grad_norm": 0.29751845947665584, "learning_rate": 8.759867583396996e-05, "loss": 2.0772, "step": 490 }, { "epoch": 0.12, "grad_norm": 0.2499209815555211, "learning_rate": 8.757321110262287e-05, "loss": 2.2188, "step": 491 }, { "epoch": 0.13, "grad_norm": 0.2432774199108946, "learning_rate": 8.754774637127579e-05, "loss": 1.7888, "step": 492 }, { "epoch": 0.13, "grad_norm": 0.2342067273383554, "learning_rate": 8.752228163992871e-05, "loss": 1.9258, "step": 493 }, { "epoch": 0.13, "grad_norm": 0.262556869159515, "learning_rate": 8.749681690858161e-05, "loss": 2.0776, "step": 494 }, { "epoch": 0.13, "grad_norm": 0.21427865705052013, "learning_rate": 8.747135217723454e-05, "loss": 1.9552, "step": 495 }, { "epoch": 0.13, "grad_norm": 0.2385128880858244, "learning_rate": 8.744588744588745e-05, "loss": 2.0085, "step": 496 }, { "epoch": 0.13, "grad_norm": 0.19959484390847743, "learning_rate": 8.742042271454036e-05, "loss": 1.6707, "step": 497 }, { "epoch": 0.13, "grad_norm": 0.2590217264205479, "learning_rate": 8.739495798319329e-05, "loss": 2.0314, "step": 498 }, { "epoch": 0.13, "grad_norm": 0.24982669903351035, "learning_rate": 8.73694932518462e-05, "loss": 1.6773, "step": 499 }, { "epoch": 0.13, "grad_norm": 0.26589731674888417, "learning_rate": 8.734402852049911e-05, "loss": 2.1827, "step": 500 }, { "epoch": 0.13, "grad_norm": 0.2413060236399928, "learning_rate": 8.731856378915203e-05, "loss": 1.9234, "step": 501 }, { "epoch": 0.13, "grad_norm": 0.25943996695878224, "learning_rate": 8.729309905780493e-05, "loss": 2.064, "step": 502 }, { "epoch": 0.13, "grad_norm": 0.2362646853207089, "learning_rate": 8.726763432645786e-05, "loss": 1.736, "step": 503 }, { "epoch": 0.13, "grad_norm": 0.5992785606723764, "learning_rate": 8.724216959511078e-05, "loss": 2.0825, "step": 504 }, { "epoch": 0.13, "grad_norm": 0.2540455949327468, "learning_rate": 8.721670486376368e-05, "loss": 1.9849, "step": 505 }, { "epoch": 0.13, "grad_norm": 0.28364373494321304, "learning_rate": 8.719124013241661e-05, "loss": 1.9012, "step": 506 }, { "epoch": 0.13, "grad_norm": 0.24171865365853143, "learning_rate": 8.716577540106952e-05, "loss": 2.0105, "step": 507 }, { "epoch": 0.13, "grad_norm": 0.26652123215640866, "learning_rate": 8.714031066972243e-05, "loss": 1.7502, "step": 508 }, { "epoch": 0.13, "grad_norm": 0.24173515900322068, "learning_rate": 8.711484593837536e-05, "loss": 1.9814, "step": 509 }, { "epoch": 0.13, "grad_norm": 0.23354274486346174, "learning_rate": 8.708938120702827e-05, "loss": 1.7613, "step": 510 }, { "epoch": 0.13, "grad_norm": 0.2443504042577395, "learning_rate": 8.706391647568118e-05, "loss": 1.6964, "step": 511 }, { "epoch": 0.13, "grad_norm": 0.24122925352912084, "learning_rate": 8.703845174433411e-05, "loss": 1.8006, "step": 512 }, { "epoch": 0.13, "grad_norm": 0.2333847584566521, "learning_rate": 8.701298701298701e-05, "loss": 1.8593, "step": 513 }, { "epoch": 0.13, "grad_norm": 0.2279094660924481, "learning_rate": 8.698752228163993e-05, "loss": 1.8878, "step": 514 }, { "epoch": 0.13, "grad_norm": 0.2527167996599817, "learning_rate": 8.696205755029286e-05, "loss": 1.8256, "step": 515 }, { "epoch": 0.13, "grad_norm": 0.2681148186553472, "learning_rate": 8.693659281894576e-05, "loss": 1.8707, "step": 516 }, { "epoch": 0.13, "grad_norm": 0.23937145570416912, "learning_rate": 8.691112808759868e-05, "loss": 1.9254, "step": 517 }, { "epoch": 0.13, "grad_norm": 0.2501936549921692, "learning_rate": 8.68856633562516e-05, "loss": 1.9475, "step": 518 }, { "epoch": 0.13, "grad_norm": 0.24358617767152815, "learning_rate": 8.68601986249045e-05, "loss": 2.0011, "step": 519 }, { "epoch": 0.13, "grad_norm": 0.24964833858406257, "learning_rate": 8.683473389355743e-05, "loss": 1.9767, "step": 520 }, { "epoch": 0.13, "grad_norm": 0.22841990332662865, "learning_rate": 8.680926916221034e-05, "loss": 1.8258, "step": 521 }, { "epoch": 0.13, "grad_norm": 0.2216953490518328, "learning_rate": 8.678380443086326e-05, "loss": 1.8854, "step": 522 }, { "epoch": 0.13, "grad_norm": 0.21155130303192776, "learning_rate": 8.675833969951618e-05, "loss": 1.6807, "step": 523 }, { "epoch": 0.13, "grad_norm": 0.2564998792365461, "learning_rate": 8.673287496816908e-05, "loss": 2.0952, "step": 524 }, { "epoch": 0.13, "grad_norm": 0.32649142208193577, "learning_rate": 8.6707410236822e-05, "loss": 2.3346, "step": 525 }, { "epoch": 0.13, "grad_norm": 0.22465641524281874, "learning_rate": 8.668194550547493e-05, "loss": 1.876, "step": 526 }, { "epoch": 0.13, "grad_norm": 0.23566365423282545, "learning_rate": 8.665648077412783e-05, "loss": 1.7912, "step": 527 }, { "epoch": 0.13, "grad_norm": 0.2426874706745744, "learning_rate": 8.663101604278076e-05, "loss": 2.1064, "step": 528 }, { "epoch": 0.13, "grad_norm": 0.256102526492817, "learning_rate": 8.660555131143367e-05, "loss": 1.7732, "step": 529 }, { "epoch": 0.13, "grad_norm": 0.22674630371643414, "learning_rate": 8.658008658008658e-05, "loss": 1.8903, "step": 530 }, { "epoch": 0.14, "grad_norm": 0.3223009379141864, "learning_rate": 8.65546218487395e-05, "loss": 2.0436, "step": 531 }, { "epoch": 0.14, "grad_norm": 0.253161114600626, "learning_rate": 8.652915711739242e-05, "loss": 1.9741, "step": 532 }, { "epoch": 0.14, "grad_norm": 0.23017841575516462, "learning_rate": 8.650369238604533e-05, "loss": 1.7193, "step": 533 }, { "epoch": 0.14, "grad_norm": 0.23793766389765506, "learning_rate": 8.647822765469826e-05, "loss": 1.8982, "step": 534 }, { "epoch": 0.14, "grad_norm": 0.24108078568832444, "learning_rate": 8.645276292335115e-05, "loss": 1.8601, "step": 535 }, { "epoch": 0.14, "grad_norm": 0.21032110859794031, "learning_rate": 8.642729819200408e-05, "loss": 1.7736, "step": 536 }, { "epoch": 0.14, "grad_norm": 0.25852270274014505, "learning_rate": 8.640183346065699e-05, "loss": 2.2393, "step": 537 }, { "epoch": 0.14, "grad_norm": 0.21256717118119425, "learning_rate": 8.63763687293099e-05, "loss": 1.7333, "step": 538 }, { "epoch": 0.14, "grad_norm": 0.24096739207525672, "learning_rate": 8.635090399796283e-05, "loss": 1.8944, "step": 539 }, { "epoch": 0.14, "grad_norm": 0.25287979694385604, "learning_rate": 8.632543926661574e-05, "loss": 1.9336, "step": 540 }, { "epoch": 0.14, "grad_norm": 0.2716083517972064, "learning_rate": 8.629997453526865e-05, "loss": 2.0506, "step": 541 }, { "epoch": 0.14, "grad_norm": 0.2370221846524856, "learning_rate": 8.627450980392158e-05, "loss": 1.8359, "step": 542 }, { "epoch": 0.14, "grad_norm": 0.24195282856938768, "learning_rate": 8.624904507257449e-05, "loss": 1.8325, "step": 543 }, { "epoch": 0.14, "grad_norm": 0.28289711302928944, "learning_rate": 8.62235803412274e-05, "loss": 1.9292, "step": 544 }, { "epoch": 0.14, "grad_norm": 0.24438103092781144, "learning_rate": 8.619811560988032e-05, "loss": 1.5135, "step": 545 }, { "epoch": 0.14, "grad_norm": 0.2214814446606962, "learning_rate": 8.617265087853324e-05, "loss": 1.8107, "step": 546 }, { "epoch": 0.14, "grad_norm": 0.2495619616900341, "learning_rate": 8.614718614718615e-05, "loss": 1.8602, "step": 547 }, { "epoch": 0.14, "grad_norm": 0.25771399263311523, "learning_rate": 8.612172141583907e-05, "loss": 1.8007, "step": 548 }, { "epoch": 0.14, "grad_norm": 0.24994041060517444, "learning_rate": 8.609625668449198e-05, "loss": 1.9156, "step": 549 }, { "epoch": 0.14, "grad_norm": 0.22366784167075848, "learning_rate": 8.60707919531449e-05, "loss": 1.9989, "step": 550 }, { "epoch": 0.14, "grad_norm": 0.2630253246918491, "learning_rate": 8.604532722179781e-05, "loss": 1.6313, "step": 551 }, { "epoch": 0.14, "grad_norm": 0.26379510331153455, "learning_rate": 8.601986249045073e-05, "loss": 1.7552, "step": 552 }, { "epoch": 0.14, "grad_norm": 0.2079340754806867, "learning_rate": 8.599439775910365e-05, "loss": 2.0048, "step": 553 }, { "epoch": 0.14, "grad_norm": 0.21692329020273132, "learning_rate": 8.596893302775656e-05, "loss": 1.9039, "step": 554 }, { "epoch": 0.14, "grad_norm": 0.22066896731286834, "learning_rate": 8.594346829640948e-05, "loss": 1.7687, "step": 555 }, { "epoch": 0.14, "grad_norm": 0.22092093632239163, "learning_rate": 8.591800356506239e-05, "loss": 1.5822, "step": 556 }, { "epoch": 0.14, "grad_norm": 0.26549194313206226, "learning_rate": 8.589253883371531e-05, "loss": 1.9684, "step": 557 }, { "epoch": 0.14, "grad_norm": 0.2781915333279139, "learning_rate": 8.586707410236823e-05, "loss": 2.1979, "step": 558 }, { "epoch": 0.14, "grad_norm": 0.2516189251283423, "learning_rate": 8.584160937102114e-05, "loss": 2.0177, "step": 559 }, { "epoch": 0.14, "grad_norm": 0.22633635810464564, "learning_rate": 8.581614463967405e-05, "loss": 1.7088, "step": 560 }, { "epoch": 0.14, "grad_norm": 0.25571341391614216, "learning_rate": 8.579067990832698e-05, "loss": 1.666, "step": 561 }, { "epoch": 0.14, "grad_norm": 0.24154109213497876, "learning_rate": 8.576521517697989e-05, "loss": 1.9523, "step": 562 }, { "epoch": 0.14, "grad_norm": 0.198164384354011, "learning_rate": 8.57397504456328e-05, "loss": 1.8036, "step": 563 }, { "epoch": 0.14, "grad_norm": 0.25184206030165557, "learning_rate": 8.571428571428571e-05, "loss": 1.9446, "step": 564 }, { "epoch": 0.14, "grad_norm": 0.2573172407271812, "learning_rate": 8.568882098293864e-05, "loss": 1.9315, "step": 565 }, { "epoch": 0.14, "grad_norm": 0.2696812392046249, "learning_rate": 8.566335625159155e-05, "loss": 1.8225, "step": 566 }, { "epoch": 0.14, "grad_norm": 0.25286593425375464, "learning_rate": 8.563789152024446e-05, "loss": 2.0231, "step": 567 }, { "epoch": 0.14, "grad_norm": 0.22767392856341184, "learning_rate": 8.561242678889739e-05, "loss": 1.7285, "step": 568 }, { "epoch": 0.14, "grad_norm": 0.24801175927731736, "learning_rate": 8.55869620575503e-05, "loss": 1.6252, "step": 569 }, { "epoch": 0.15, "grad_norm": 0.21808987878527664, "learning_rate": 8.556149732620321e-05, "loss": 1.7553, "step": 570 }, { "epoch": 0.15, "grad_norm": 0.2573754623320004, "learning_rate": 8.553603259485612e-05, "loss": 2.1009, "step": 571 }, { "epoch": 0.15, "grad_norm": 0.18939076213266787, "learning_rate": 8.551056786350904e-05, "loss": 1.686, "step": 572 }, { "epoch": 0.15, "grad_norm": 0.24812637016436956, "learning_rate": 8.548510313216196e-05, "loss": 1.9609, "step": 573 }, { "epoch": 0.15, "grad_norm": 0.2338966733848831, "learning_rate": 8.545963840081487e-05, "loss": 1.9818, "step": 574 }, { "epoch": 0.15, "grad_norm": 0.2404840149226457, "learning_rate": 8.543417366946779e-05, "loss": 1.8081, "step": 575 }, { "epoch": 0.15, "grad_norm": 0.25798725216628743, "learning_rate": 8.540870893812071e-05, "loss": 1.7854, "step": 576 }, { "epoch": 0.15, "grad_norm": 0.27750663272560383, "learning_rate": 8.538324420677362e-05, "loss": 2.0337, "step": 577 }, { "epoch": 0.15, "grad_norm": 0.2769580899803715, "learning_rate": 8.535777947542654e-05, "loss": 1.9486, "step": 578 }, { "epoch": 0.15, "grad_norm": 0.25306861331816644, "learning_rate": 8.533231474407946e-05, "loss": 1.7939, "step": 579 }, { "epoch": 0.15, "grad_norm": 0.22845769620715287, "learning_rate": 8.530685001273236e-05, "loss": 1.8121, "step": 580 }, { "epoch": 0.15, "grad_norm": 0.2595561480765673, "learning_rate": 8.528138528138529e-05, "loss": 1.8217, "step": 581 }, { "epoch": 0.15, "grad_norm": 0.2548756068588673, "learning_rate": 8.52559205500382e-05, "loss": 1.7951, "step": 582 }, { "epoch": 0.15, "grad_norm": 0.23753990478065184, "learning_rate": 8.523045581869111e-05, "loss": 1.827, "step": 583 }, { "epoch": 0.15, "grad_norm": 0.2795844698118675, "learning_rate": 8.520499108734404e-05, "loss": 1.8995, "step": 584 }, { "epoch": 0.15, "grad_norm": 0.2650106702986557, "learning_rate": 8.517952635599695e-05, "loss": 1.8459, "step": 585 }, { "epoch": 0.15, "grad_norm": 0.2583049636819499, "learning_rate": 8.515406162464986e-05, "loss": 2.0852, "step": 586 }, { "epoch": 0.15, "grad_norm": 0.2544859357696881, "learning_rate": 8.512859689330279e-05, "loss": 1.7271, "step": 587 }, { "epoch": 0.15, "grad_norm": 0.27203970995110843, "learning_rate": 8.51031321619557e-05, "loss": 2.1576, "step": 588 }, { "epoch": 0.15, "grad_norm": 0.2760773611682643, "learning_rate": 8.507766743060861e-05, "loss": 2.1948, "step": 589 }, { "epoch": 0.15, "grad_norm": 0.26657512379993753, "learning_rate": 8.505220269926153e-05, "loss": 1.952, "step": 590 }, { "epoch": 0.15, "grad_norm": 0.22449921933656483, "learning_rate": 8.502673796791443e-05, "loss": 1.8616, "step": 591 }, { "epoch": 0.15, "grad_norm": 0.2269948090503817, "learning_rate": 8.500127323656736e-05, "loss": 1.7622, "step": 592 }, { "epoch": 0.15, "grad_norm": 0.225806033942239, "learning_rate": 8.497580850522027e-05, "loss": 1.679, "step": 593 }, { "epoch": 0.15, "grad_norm": 0.27226095482913315, "learning_rate": 8.495034377387318e-05, "loss": 2.2212, "step": 594 }, { "epoch": 0.15, "grad_norm": 0.25834770713831146, "learning_rate": 8.492487904252611e-05, "loss": 1.905, "step": 595 }, { "epoch": 0.15, "grad_norm": 0.22415664490617518, "learning_rate": 8.489941431117902e-05, "loss": 1.9853, "step": 596 }, { "epoch": 0.15, "grad_norm": 0.2614588372097758, "learning_rate": 8.487394957983193e-05, "loss": 1.7831, "step": 597 }, { "epoch": 0.15, "grad_norm": 0.2513907906833273, "learning_rate": 8.484848484848486e-05, "loss": 2.1041, "step": 598 }, { "epoch": 0.15, "grad_norm": 0.21971733073868202, "learning_rate": 8.482302011713776e-05, "loss": 1.6445, "step": 599 }, { "epoch": 0.15, "grad_norm": 0.2197734189405905, "learning_rate": 8.479755538579068e-05, "loss": 1.5833, "step": 600 }, { "epoch": 0.15, "grad_norm": 0.26087178356434226, "learning_rate": 8.477209065444361e-05, "loss": 1.8881, "step": 601 }, { "epoch": 0.15, "grad_norm": 0.27040403833766596, "learning_rate": 8.474662592309651e-05, "loss": 1.9293, "step": 602 }, { "epoch": 0.15, "grad_norm": 0.2602180613727054, "learning_rate": 8.472116119174943e-05, "loss": 2.1294, "step": 603 }, { "epoch": 0.15, "grad_norm": 0.2234191039988877, "learning_rate": 8.469569646040236e-05, "loss": 1.7762, "step": 604 }, { "epoch": 0.15, "grad_norm": 0.3153772694403617, "learning_rate": 8.467023172905526e-05, "loss": 1.9273, "step": 605 }, { "epoch": 0.15, "grad_norm": 0.23455438107172347, "learning_rate": 8.464476699770818e-05, "loss": 1.9544, "step": 606 }, { "epoch": 0.15, "grad_norm": 0.2658116862672306, "learning_rate": 8.46193022663611e-05, "loss": 2.2271, "step": 607 }, { "epoch": 0.15, "grad_norm": 0.32975917281160555, "learning_rate": 8.4593837535014e-05, "loss": 1.7337, "step": 608 }, { "epoch": 0.16, "grad_norm": 0.2228088608801704, "learning_rate": 8.456837280366693e-05, "loss": 2.0129, "step": 609 }, { "epoch": 0.16, "grad_norm": 0.26447431178004766, "learning_rate": 8.454290807231983e-05, "loss": 1.9728, "step": 610 }, { "epoch": 0.16, "grad_norm": 0.21976201089381797, "learning_rate": 8.451744334097276e-05, "loss": 1.8064, "step": 611 }, { "epoch": 0.16, "grad_norm": 0.3072809470726187, "learning_rate": 8.449197860962568e-05, "loss": 1.835, "step": 612 }, { "epoch": 0.16, "grad_norm": 0.24577672914201923, "learning_rate": 8.446651387827858e-05, "loss": 1.9267, "step": 613 }, { "epoch": 0.16, "grad_norm": 0.2236691741034138, "learning_rate": 8.44410491469315e-05, "loss": 1.8683, "step": 614 }, { "epoch": 0.16, "grad_norm": 0.23482479010040266, "learning_rate": 8.441558441558442e-05, "loss": 2.0112, "step": 615 }, { "epoch": 0.16, "grad_norm": 0.25066078299216493, "learning_rate": 8.439011968423733e-05, "loss": 1.835, "step": 616 }, { "epoch": 0.16, "grad_norm": 0.264604515569423, "learning_rate": 8.436465495289026e-05, "loss": 1.9542, "step": 617 }, { "epoch": 0.16, "grad_norm": 0.26229058609109396, "learning_rate": 8.433919022154317e-05, "loss": 1.8818, "step": 618 }, { "epoch": 0.16, "grad_norm": 0.2535097988499568, "learning_rate": 8.431372549019608e-05, "loss": 1.8894, "step": 619 }, { "epoch": 0.16, "grad_norm": 0.23424679110516414, "learning_rate": 8.4288260758849e-05, "loss": 1.6131, "step": 620 }, { "epoch": 0.16, "grad_norm": 0.26691813415164034, "learning_rate": 8.426279602750192e-05, "loss": 2.029, "step": 621 }, { "epoch": 0.16, "grad_norm": 0.22172670243930442, "learning_rate": 8.423733129615483e-05, "loss": 1.6191, "step": 622 }, { "epoch": 0.16, "grad_norm": 0.27667095622410315, "learning_rate": 8.421186656480774e-05, "loss": 2.2586, "step": 623 }, { "epoch": 0.16, "grad_norm": 0.301753010140633, "learning_rate": 8.418640183346065e-05, "loss": 1.9892, "step": 624 }, { "epoch": 0.16, "grad_norm": 0.1897230388556837, "learning_rate": 8.416093710211358e-05, "loss": 1.7812, "step": 625 }, { "epoch": 0.16, "grad_norm": 0.24970974314637662, "learning_rate": 8.413547237076649e-05, "loss": 2.0589, "step": 626 }, { "epoch": 0.16, "grad_norm": 0.3032015856119192, "learning_rate": 8.41100076394194e-05, "loss": 2.2445, "step": 627 }, { "epoch": 0.16, "grad_norm": 0.2108389659614189, "learning_rate": 8.408454290807233e-05, "loss": 1.6547, "step": 628 }, { "epoch": 0.16, "grad_norm": 0.33296527758702654, "learning_rate": 8.405907817672524e-05, "loss": 2.0687, "step": 629 }, { "epoch": 0.16, "grad_norm": 0.25872450689146614, "learning_rate": 8.403361344537815e-05, "loss": 2.1157, "step": 630 }, { "epoch": 0.16, "grad_norm": 0.24882848251532677, "learning_rate": 8.400814871403108e-05, "loss": 2.1495, "step": 631 }, { "epoch": 0.16, "grad_norm": 0.24377256496837985, "learning_rate": 8.398268398268399e-05, "loss": 1.6708, "step": 632 }, { "epoch": 0.16, "grad_norm": 0.27653964427752414, "learning_rate": 8.39572192513369e-05, "loss": 2.0369, "step": 633 }, { "epoch": 0.16, "grad_norm": 0.23087998248198688, "learning_rate": 8.393175451998982e-05, "loss": 1.9132, "step": 634 }, { "epoch": 0.16, "grad_norm": 0.2343100298794321, "learning_rate": 8.390628978864273e-05, "loss": 2.0154, "step": 635 }, { "epoch": 0.16, "grad_norm": 0.27080108224361626, "learning_rate": 8.388082505729565e-05, "loss": 1.9704, "step": 636 }, { "epoch": 0.16, "grad_norm": 0.23709343506276895, "learning_rate": 8.385536032594857e-05, "loss": 1.9831, "step": 637 }, { "epoch": 0.16, "grad_norm": 0.2233678922733808, "learning_rate": 8.382989559460148e-05, "loss": 1.5794, "step": 638 }, { "epoch": 0.16, "grad_norm": 0.22254628259973275, "learning_rate": 8.38044308632544e-05, "loss": 1.9141, "step": 639 }, { "epoch": 0.16, "grad_norm": 0.22679648977175332, "learning_rate": 8.377896613190732e-05, "loss": 1.869, "step": 640 }, { "epoch": 0.16, "grad_norm": 0.24650163756314084, "learning_rate": 8.375350140056023e-05, "loss": 1.975, "step": 641 }, { "epoch": 0.16, "grad_norm": 0.2356487471404544, "learning_rate": 8.372803666921314e-05, "loss": 2.0583, "step": 642 }, { "epoch": 0.16, "grad_norm": 0.23967999170522092, "learning_rate": 8.370257193786606e-05, "loss": 1.7817, "step": 643 }, { "epoch": 0.16, "grad_norm": 0.20266040525683782, "learning_rate": 8.367710720651898e-05, "loss": 1.7697, "step": 644 }, { "epoch": 0.16, "grad_norm": 0.2594234966991798, "learning_rate": 8.365164247517189e-05, "loss": 1.8042, "step": 645 }, { "epoch": 0.16, "grad_norm": 0.2562563488336597, "learning_rate": 8.36261777438248e-05, "loss": 1.9621, "step": 646 }, { "epoch": 0.16, "grad_norm": 0.27401385191984207, "learning_rate": 8.360071301247773e-05, "loss": 1.9699, "step": 647 }, { "epoch": 0.16, "grad_norm": 0.2675310914964076, "learning_rate": 8.357524828113064e-05, "loss": 1.8894, "step": 648 }, { "epoch": 0.17, "grad_norm": 0.2242659555646828, "learning_rate": 8.354978354978355e-05, "loss": 1.8756, "step": 649 }, { "epoch": 0.17, "grad_norm": 0.254521578001409, "learning_rate": 8.352431881843646e-05, "loss": 1.8776, "step": 650 }, { "epoch": 0.17, "grad_norm": 0.24220297364056145, "learning_rate": 8.349885408708939e-05, "loss": 1.887, "step": 651 }, { "epoch": 0.17, "grad_norm": 0.2440556645382515, "learning_rate": 8.34733893557423e-05, "loss": 1.7679, "step": 652 }, { "epoch": 0.17, "grad_norm": 0.22591067139033177, "learning_rate": 8.344792462439521e-05, "loss": 1.9008, "step": 653 }, { "epoch": 0.17, "grad_norm": 0.3398909700407971, "learning_rate": 8.342245989304814e-05, "loss": 1.8229, "step": 654 }, { "epoch": 0.17, "grad_norm": 0.24230125278447373, "learning_rate": 8.339699516170105e-05, "loss": 1.7633, "step": 655 }, { "epoch": 0.17, "grad_norm": 0.20022248804750795, "learning_rate": 8.337153043035396e-05, "loss": 1.6237, "step": 656 }, { "epoch": 0.17, "grad_norm": 0.25412426905003854, "learning_rate": 8.334606569900687e-05, "loss": 1.8433, "step": 657 }, { "epoch": 0.17, "grad_norm": 0.22552341569093054, "learning_rate": 8.332060096765979e-05, "loss": 1.7831, "step": 658 }, { "epoch": 0.17, "grad_norm": 0.22664716388325745, "learning_rate": 8.329513623631271e-05, "loss": 1.7294, "step": 659 }, { "epoch": 0.17, "grad_norm": 0.25906233843900756, "learning_rate": 8.326967150496562e-05, "loss": 1.8616, "step": 660 }, { "epoch": 0.17, "grad_norm": 0.3111900770192149, "learning_rate": 8.324420677361854e-05, "loss": 2.0473, "step": 661 }, { "epoch": 0.17, "grad_norm": 0.24845573190451736, "learning_rate": 8.321874204227146e-05, "loss": 2.0842, "step": 662 }, { "epoch": 0.17, "grad_norm": 0.2721528256093899, "learning_rate": 8.319327731092437e-05, "loss": 2.01, "step": 663 }, { "epoch": 0.17, "grad_norm": 0.29973802850597847, "learning_rate": 8.316781257957729e-05, "loss": 2.0272, "step": 664 }, { "epoch": 0.17, "grad_norm": 0.22714114288136683, "learning_rate": 8.314234784823021e-05, "loss": 1.7836, "step": 665 }, { "epoch": 0.17, "grad_norm": 0.23256989582074034, "learning_rate": 8.311688311688312e-05, "loss": 1.6346, "step": 666 }, { "epoch": 0.17, "grad_norm": 0.23821437397510137, "learning_rate": 8.309141838553604e-05, "loss": 1.7571, "step": 667 }, { "epoch": 0.17, "grad_norm": 0.2412285199880779, "learning_rate": 8.306595365418896e-05, "loss": 2.0028, "step": 668 }, { "epoch": 0.17, "grad_norm": 0.26433168437428606, "learning_rate": 8.304048892284186e-05, "loss": 1.7625, "step": 669 }, { "epoch": 0.17, "grad_norm": 0.2744294023129235, "learning_rate": 8.301502419149479e-05, "loss": 2.0854, "step": 670 }, { "epoch": 0.17, "grad_norm": 0.26885424466275476, "learning_rate": 8.29895594601477e-05, "loss": 1.9209, "step": 671 }, { "epoch": 0.17, "grad_norm": 0.2709272142457605, "learning_rate": 8.296409472880061e-05, "loss": 2.1129, "step": 672 }, { "epoch": 0.17, "grad_norm": 0.23999185648900634, "learning_rate": 8.293862999745354e-05, "loss": 1.6473, "step": 673 }, { "epoch": 0.17, "grad_norm": 0.25770608555351354, "learning_rate": 8.291316526610645e-05, "loss": 1.7928, "step": 674 }, { "epoch": 0.17, "grad_norm": 0.2537587397578314, "learning_rate": 8.288770053475936e-05, "loss": 1.8933, "step": 675 }, { "epoch": 0.17, "grad_norm": 0.23704349601861344, "learning_rate": 8.286223580341229e-05, "loss": 1.8767, "step": 676 }, { "epoch": 0.17, "grad_norm": 0.2513192208858804, "learning_rate": 8.283677107206518e-05, "loss": 1.7624, "step": 677 }, { "epoch": 0.17, "grad_norm": 0.2537437065405792, "learning_rate": 8.281130634071811e-05, "loss": 1.9107, "step": 678 }, { "epoch": 0.17, "grad_norm": 0.25124285630845544, "learning_rate": 8.278584160937104e-05, "loss": 2.2631, "step": 679 }, { "epoch": 0.17, "grad_norm": 0.2830311483235791, "learning_rate": 8.276037687802393e-05, "loss": 1.9933, "step": 680 }, { "epoch": 0.17, "grad_norm": 0.22526784656744517, "learning_rate": 8.273491214667686e-05, "loss": 1.745, "step": 681 }, { "epoch": 0.17, "grad_norm": 0.20692988379313565, "learning_rate": 8.270944741532977e-05, "loss": 1.8224, "step": 682 }, { "epoch": 0.17, "grad_norm": 0.3125523880915381, "learning_rate": 8.268398268398268e-05, "loss": 2.1634, "step": 683 }, { "epoch": 0.17, "grad_norm": 0.26933932727564425, "learning_rate": 8.265851795263561e-05, "loss": 2.0392, "step": 684 }, { "epoch": 0.17, "grad_norm": 0.2341672214756589, "learning_rate": 8.263305322128852e-05, "loss": 1.8573, "step": 685 }, { "epoch": 0.17, "grad_norm": 0.24637431847948854, "learning_rate": 8.260758848994143e-05, "loss": 1.7781, "step": 686 }, { "epoch": 0.17, "grad_norm": 0.27247735797800854, "learning_rate": 8.258212375859436e-05, "loss": 2.0406, "step": 687 }, { "epoch": 0.18, "grad_norm": 0.2509004499748774, "learning_rate": 8.255665902724726e-05, "loss": 2.0022, "step": 688 }, { "epoch": 0.18, "grad_norm": 0.2490574233855169, "learning_rate": 8.253119429590018e-05, "loss": 1.9605, "step": 689 }, { "epoch": 0.18, "grad_norm": 0.21044261594535146, "learning_rate": 8.250572956455311e-05, "loss": 1.8126, "step": 690 }, { "epoch": 0.18, "grad_norm": 0.2536984840725562, "learning_rate": 8.248026483320601e-05, "loss": 1.9673, "step": 691 }, { "epoch": 0.18, "grad_norm": 0.20622765029844037, "learning_rate": 8.245480010185893e-05, "loss": 1.7606, "step": 692 }, { "epoch": 0.18, "grad_norm": 0.2542420147904985, "learning_rate": 8.242933537051184e-05, "loss": 2.0577, "step": 693 }, { "epoch": 0.18, "grad_norm": 0.26315616047124607, "learning_rate": 8.240387063916476e-05, "loss": 1.9809, "step": 694 }, { "epoch": 0.18, "grad_norm": 0.22211583961688874, "learning_rate": 8.237840590781768e-05, "loss": 1.8249, "step": 695 }, { "epoch": 0.18, "grad_norm": 0.24345249090692983, "learning_rate": 8.23529411764706e-05, "loss": 2.1196, "step": 696 }, { "epoch": 0.18, "grad_norm": 0.2437022889955659, "learning_rate": 8.23274764451235e-05, "loss": 2.2124, "step": 697 }, { "epoch": 0.18, "grad_norm": 0.21587595502844253, "learning_rate": 8.230201171377643e-05, "loss": 1.7082, "step": 698 }, { "epoch": 0.18, "grad_norm": 0.19780382068027816, "learning_rate": 8.227654698242933e-05, "loss": 1.6877, "step": 699 }, { "epoch": 0.18, "grad_norm": 0.2606709700917808, "learning_rate": 8.225108225108226e-05, "loss": 1.6861, "step": 700 }, { "epoch": 0.18, "grad_norm": 0.22780511715912988, "learning_rate": 8.222561751973517e-05, "loss": 1.8206, "step": 701 }, { "epoch": 0.18, "grad_norm": 0.21568730083147977, "learning_rate": 8.220015278838808e-05, "loss": 1.7652, "step": 702 }, { "epoch": 0.18, "grad_norm": 0.23477539194517308, "learning_rate": 8.2174688057041e-05, "loss": 1.8679, "step": 703 }, { "epoch": 0.18, "grad_norm": 0.2474139165125321, "learning_rate": 8.214922332569392e-05, "loss": 1.7812, "step": 704 }, { "epoch": 0.18, "grad_norm": 0.21314623125660764, "learning_rate": 8.212375859434683e-05, "loss": 1.7575, "step": 705 }, { "epoch": 0.18, "grad_norm": 0.2562433957202535, "learning_rate": 8.209829386299976e-05, "loss": 1.9478, "step": 706 }, { "epoch": 0.18, "grad_norm": 0.29203077845895087, "learning_rate": 8.207282913165267e-05, "loss": 2.145, "step": 707 }, { "epoch": 0.18, "grad_norm": 0.23551961035637994, "learning_rate": 8.204736440030558e-05, "loss": 2.0507, "step": 708 }, { "epoch": 0.18, "grad_norm": 0.2874919133500298, "learning_rate": 8.20218996689585e-05, "loss": 1.8884, "step": 709 }, { "epoch": 0.18, "grad_norm": 0.2435714419184239, "learning_rate": 8.19964349376114e-05, "loss": 1.7244, "step": 710 }, { "epoch": 0.18, "grad_norm": 0.25461303933038876, "learning_rate": 8.197097020626433e-05, "loss": 1.8317, "step": 711 }, { "epoch": 0.18, "grad_norm": 0.3174139389033714, "learning_rate": 8.194550547491724e-05, "loss": 2.4544, "step": 712 }, { "epoch": 0.18, "grad_norm": 0.260350536850943, "learning_rate": 8.192004074357015e-05, "loss": 1.7694, "step": 713 }, { "epoch": 0.18, "grad_norm": 0.26749580397835515, "learning_rate": 8.189457601222308e-05, "loss": 1.8921, "step": 714 }, { "epoch": 0.18, "grad_norm": 0.21515415482896466, "learning_rate": 8.186911128087599e-05, "loss": 1.6123, "step": 715 }, { "epoch": 0.18, "grad_norm": 0.2814437675566034, "learning_rate": 8.18436465495289e-05, "loss": 1.6636, "step": 716 }, { "epoch": 0.18, "grad_norm": 0.25757055778830756, "learning_rate": 8.181818181818183e-05, "loss": 2.0113, "step": 717 }, { "epoch": 0.18, "grad_norm": 0.26431323567719006, "learning_rate": 8.179271708683474e-05, "loss": 1.9948, "step": 718 }, { "epoch": 0.18, "grad_norm": 0.264939728509053, "learning_rate": 8.176725235548765e-05, "loss": 1.823, "step": 719 }, { "epoch": 0.18, "grad_norm": 0.22863864847566018, "learning_rate": 8.174178762414057e-05, "loss": 1.7145, "step": 720 }, { "epoch": 0.18, "grad_norm": 0.2926476308161338, "learning_rate": 8.171632289279348e-05, "loss": 1.9253, "step": 721 }, { "epoch": 0.18, "grad_norm": 0.2567930149430035, "learning_rate": 8.16908581614464e-05, "loss": 1.9469, "step": 722 }, { "epoch": 0.18, "grad_norm": 0.21388350820757424, "learning_rate": 8.166539343009932e-05, "loss": 1.7821, "step": 723 }, { "epoch": 0.18, "grad_norm": 0.2390070476919383, "learning_rate": 8.163992869875223e-05, "loss": 1.7102, "step": 724 }, { "epoch": 0.18, "grad_norm": 0.24249325187378476, "learning_rate": 8.161446396740515e-05, "loss": 1.8183, "step": 725 }, { "epoch": 0.18, "grad_norm": 0.22973235529222133, "learning_rate": 8.158899923605807e-05, "loss": 1.8443, "step": 726 }, { "epoch": 0.19, "grad_norm": 0.2278244128468002, "learning_rate": 8.156353450471098e-05, "loss": 1.8287, "step": 727 }, { "epoch": 0.19, "grad_norm": 0.21975864600021108, "learning_rate": 8.153806977336389e-05, "loss": 1.7561, "step": 728 }, { "epoch": 0.19, "grad_norm": 0.24673932789949182, "learning_rate": 8.151260504201682e-05, "loss": 1.8798, "step": 729 }, { "epoch": 0.19, "grad_norm": 0.23689722471477442, "learning_rate": 8.148714031066973e-05, "loss": 1.9101, "step": 730 }, { "epoch": 0.19, "grad_norm": 0.23534223925853134, "learning_rate": 8.146167557932264e-05, "loss": 2.0234, "step": 731 }, { "epoch": 0.19, "grad_norm": 0.23786358891087073, "learning_rate": 8.143621084797556e-05, "loss": 1.8746, "step": 732 }, { "epoch": 0.19, "grad_norm": 0.2531245449438832, "learning_rate": 8.141074611662848e-05, "loss": 1.7922, "step": 733 }, { "epoch": 0.19, "grad_norm": 0.28806956616452595, "learning_rate": 8.138528138528139e-05, "loss": 1.9351, "step": 734 }, { "epoch": 0.19, "grad_norm": 0.2245345944850363, "learning_rate": 8.13598166539343e-05, "loss": 1.7899, "step": 735 }, { "epoch": 0.19, "grad_norm": 0.262884773654068, "learning_rate": 8.133435192258721e-05, "loss": 1.9936, "step": 736 }, { "epoch": 0.19, "grad_norm": 0.2660954600675136, "learning_rate": 8.130888719124014e-05, "loss": 1.9235, "step": 737 }, { "epoch": 0.19, "grad_norm": 0.2544210936284766, "learning_rate": 8.128342245989305e-05, "loss": 2.0572, "step": 738 }, { "epoch": 0.19, "grad_norm": 0.21115268222074945, "learning_rate": 8.125795772854596e-05, "loss": 1.6713, "step": 739 }, { "epoch": 0.19, "grad_norm": 0.23399606907937723, "learning_rate": 8.123249299719889e-05, "loss": 1.7741, "step": 740 }, { "epoch": 0.19, "grad_norm": 0.2413301829518615, "learning_rate": 8.12070282658518e-05, "loss": 1.617, "step": 741 }, { "epoch": 0.19, "grad_norm": 0.24363345300377096, "learning_rate": 8.118156353450471e-05, "loss": 1.8219, "step": 742 }, { "epoch": 0.19, "grad_norm": 0.21322552997622987, "learning_rate": 8.115609880315764e-05, "loss": 1.8923, "step": 743 }, { "epoch": 0.19, "grad_norm": 0.2648814959339343, "learning_rate": 8.113063407181055e-05, "loss": 1.8955, "step": 744 }, { "epoch": 0.19, "grad_norm": 0.24604419936013278, "learning_rate": 8.110516934046346e-05, "loss": 1.7781, "step": 745 }, { "epoch": 0.19, "grad_norm": 0.31400117322830134, "learning_rate": 8.107970460911637e-05, "loss": 2.0413, "step": 746 }, { "epoch": 0.19, "grad_norm": 0.23840782055804374, "learning_rate": 8.105423987776929e-05, "loss": 1.8463, "step": 747 }, { "epoch": 0.19, "grad_norm": 0.2444438272270324, "learning_rate": 8.102877514642221e-05, "loss": 1.7236, "step": 748 }, { "epoch": 0.19, "grad_norm": 0.2796474737157944, "learning_rate": 8.100331041507512e-05, "loss": 2.1684, "step": 749 }, { "epoch": 0.19, "grad_norm": 0.2133773175143318, "learning_rate": 8.097784568372804e-05, "loss": 1.707, "step": 750 }, { "epoch": 0.19, "grad_norm": 0.2839241058327468, "learning_rate": 8.095238095238096e-05, "loss": 1.8488, "step": 751 }, { "epoch": 0.19, "grad_norm": 0.2177899957357643, "learning_rate": 8.092691622103387e-05, "loss": 1.5671, "step": 752 }, { "epoch": 0.19, "grad_norm": 0.28396641702030523, "learning_rate": 8.090145148968679e-05, "loss": 1.9648, "step": 753 }, { "epoch": 0.19, "grad_norm": 0.22902053902743993, "learning_rate": 8.087598675833971e-05, "loss": 1.8343, "step": 754 }, { "epoch": 0.19, "grad_norm": 0.270526183017562, "learning_rate": 8.085052202699261e-05, "loss": 2.0001, "step": 755 }, { "epoch": 0.19, "grad_norm": 0.2427759405177403, "learning_rate": 8.082505729564554e-05, "loss": 1.8584, "step": 756 }, { "epoch": 0.19, "grad_norm": 0.20730834087964428, "learning_rate": 8.079959256429845e-05, "loss": 1.8038, "step": 757 }, { "epoch": 0.19, "grad_norm": 0.23676530738979948, "learning_rate": 8.077412783295136e-05, "loss": 1.8173, "step": 758 }, { "epoch": 0.19, "grad_norm": 0.27498343645411766, "learning_rate": 8.074866310160429e-05, "loss": 2.1532, "step": 759 }, { "epoch": 0.19, "grad_norm": 0.30573909192110055, "learning_rate": 8.07231983702572e-05, "loss": 1.7967, "step": 760 }, { "epoch": 0.19, "grad_norm": 0.25874172713465876, "learning_rate": 8.069773363891011e-05, "loss": 1.9587, "step": 761 }, { "epoch": 0.19, "grad_norm": 0.23840994564446671, "learning_rate": 8.067226890756304e-05, "loss": 1.8361, "step": 762 }, { "epoch": 0.19, "grad_norm": 0.20338547035408697, "learning_rate": 8.064680417621593e-05, "loss": 1.8748, "step": 763 }, { "epoch": 0.19, "grad_norm": 0.22107511583394449, "learning_rate": 8.062133944486886e-05, "loss": 1.7538, "step": 764 }, { "epoch": 0.19, "grad_norm": 0.23126567033564033, "learning_rate": 8.059587471352179e-05, "loss": 1.7758, "step": 765 }, { "epoch": 0.19, "grad_norm": 0.2623222140707853, "learning_rate": 8.057040998217468e-05, "loss": 2.0597, "step": 766 }, { "epoch": 0.2, "grad_norm": 0.2645590655894485, "learning_rate": 8.054494525082761e-05, "loss": 1.8437, "step": 767 }, { "epoch": 0.2, "grad_norm": 0.2017017811080249, "learning_rate": 8.051948051948052e-05, "loss": 1.7988, "step": 768 }, { "epoch": 0.2, "grad_norm": 0.23507420829129272, "learning_rate": 8.049401578813343e-05, "loss": 1.9862, "step": 769 }, { "epoch": 0.2, "grad_norm": 0.26716449538128145, "learning_rate": 8.046855105678636e-05, "loss": 1.6237, "step": 770 }, { "epoch": 0.2, "grad_norm": 0.37869065827033116, "learning_rate": 8.044308632543927e-05, "loss": 1.8673, "step": 771 }, { "epoch": 0.2, "grad_norm": 0.23080535327611468, "learning_rate": 8.041762159409218e-05, "loss": 1.7489, "step": 772 }, { "epoch": 0.2, "grad_norm": 0.25052884909217044, "learning_rate": 8.039215686274511e-05, "loss": 1.9734, "step": 773 }, { "epoch": 0.2, "grad_norm": 0.20729400047988472, "learning_rate": 8.036669213139801e-05, "loss": 1.6804, "step": 774 }, { "epoch": 0.2, "grad_norm": 0.24687693118293738, "learning_rate": 8.034122740005093e-05, "loss": 2.0605, "step": 775 }, { "epoch": 0.2, "grad_norm": 0.24239256083813954, "learning_rate": 8.031576266870386e-05, "loss": 1.818, "step": 776 }, { "epoch": 0.2, "grad_norm": 0.2385267259605302, "learning_rate": 8.029029793735676e-05, "loss": 1.9034, "step": 777 }, { "epoch": 0.2, "grad_norm": 0.2326704907462259, "learning_rate": 8.026483320600968e-05, "loss": 1.7703, "step": 778 }, { "epoch": 0.2, "grad_norm": 0.26165915992218536, "learning_rate": 8.02393684746626e-05, "loss": 2.0577, "step": 779 }, { "epoch": 0.2, "grad_norm": 0.19764839465849784, "learning_rate": 8.021390374331551e-05, "loss": 1.954, "step": 780 }, { "epoch": 0.2, "grad_norm": 0.22493088144049952, "learning_rate": 8.018843901196843e-05, "loss": 2.1367, "step": 781 }, { "epoch": 0.2, "grad_norm": 0.2341060764533811, "learning_rate": 8.016297428062134e-05, "loss": 1.5133, "step": 782 }, { "epoch": 0.2, "grad_norm": 0.38908282077529777, "learning_rate": 8.013750954927426e-05, "loss": 1.8469, "step": 783 }, { "epoch": 0.2, "grad_norm": 0.25293407657305794, "learning_rate": 8.011204481792718e-05, "loss": 1.8941, "step": 784 }, { "epoch": 0.2, "grad_norm": 0.3140019918389227, "learning_rate": 8.008658008658008e-05, "loss": 1.6617, "step": 785 }, { "epoch": 0.2, "grad_norm": 0.2559226154475287, "learning_rate": 8.0061115355233e-05, "loss": 1.6582, "step": 786 }, { "epoch": 0.2, "grad_norm": 0.26485367590384346, "learning_rate": 8.003565062388593e-05, "loss": 1.9837, "step": 787 }, { "epoch": 0.2, "grad_norm": 0.25715533221066317, "learning_rate": 8.001018589253883e-05, "loss": 1.6776, "step": 788 }, { "epoch": 0.2, "grad_norm": 0.27671433607358126, "learning_rate": 7.998472116119176e-05, "loss": 2.0062, "step": 789 }, { "epoch": 0.2, "grad_norm": 0.2609719967453421, "learning_rate": 7.995925642984467e-05, "loss": 2.0136, "step": 790 }, { "epoch": 0.2, "grad_norm": 0.26557595136731144, "learning_rate": 7.993379169849758e-05, "loss": 1.9437, "step": 791 }, { "epoch": 0.2, "grad_norm": 0.2711309423960604, "learning_rate": 7.99083269671505e-05, "loss": 1.9532, "step": 792 }, { "epoch": 0.2, "grad_norm": 0.2546122638764411, "learning_rate": 7.988286223580342e-05, "loss": 1.8557, "step": 793 }, { "epoch": 0.2, "grad_norm": 0.2632866906982076, "learning_rate": 7.985739750445633e-05, "loss": 1.935, "step": 794 }, { "epoch": 0.2, "grad_norm": 0.28892742042280245, "learning_rate": 7.983193277310926e-05, "loss": 2.0196, "step": 795 }, { "epoch": 0.2, "grad_norm": 0.2155105819372084, "learning_rate": 7.980646804176215e-05, "loss": 1.5858, "step": 796 }, { "epoch": 0.2, "grad_norm": 0.23811179529959206, "learning_rate": 7.978100331041508e-05, "loss": 1.9028, "step": 797 }, { "epoch": 0.2, "grad_norm": 0.22173199477634228, "learning_rate": 7.975553857906799e-05, "loss": 1.874, "step": 798 }, { "epoch": 0.2, "grad_norm": 0.2670009960404223, "learning_rate": 7.97300738477209e-05, "loss": 1.8718, "step": 799 }, { "epoch": 0.2, "grad_norm": 0.24864604252498942, "learning_rate": 7.970460911637383e-05, "loss": 2.1092, "step": 800 }, { "epoch": 0.2, "grad_norm": 0.22167578874428115, "learning_rate": 7.967914438502674e-05, "loss": 1.7823, "step": 801 }, { "epoch": 0.2, "grad_norm": 0.2647763467499256, "learning_rate": 7.965367965367965e-05, "loss": 1.8657, "step": 802 }, { "epoch": 0.2, "grad_norm": 0.32196599729557707, "learning_rate": 7.962821492233258e-05, "loss": 1.9988, "step": 803 }, { "epoch": 0.2, "grad_norm": 0.24083791298757853, "learning_rate": 7.960275019098549e-05, "loss": 1.6123, "step": 804 }, { "epoch": 0.2, "grad_norm": 0.2793392736091423, "learning_rate": 7.95772854596384e-05, "loss": 1.9157, "step": 805 }, { "epoch": 0.21, "grad_norm": 0.23601069294013594, "learning_rate": 7.955182072829132e-05, "loss": 1.8695, "step": 806 }, { "epoch": 0.21, "grad_norm": 0.2372446337405536, "learning_rate": 7.952635599694424e-05, "loss": 1.9903, "step": 807 }, { "epoch": 0.21, "grad_norm": 0.2515041727957943, "learning_rate": 7.950089126559715e-05, "loss": 1.9621, "step": 808 }, { "epoch": 0.21, "grad_norm": 0.2531873061180079, "learning_rate": 7.947542653425007e-05, "loss": 1.9211, "step": 809 }, { "epoch": 0.21, "grad_norm": 0.21761360632504378, "learning_rate": 7.944996180290298e-05, "loss": 1.5598, "step": 810 }, { "epoch": 0.21, "grad_norm": 0.22808846056954318, "learning_rate": 7.94244970715559e-05, "loss": 1.7554, "step": 811 }, { "epoch": 0.21, "grad_norm": 0.23077816317385094, "learning_rate": 7.939903234020882e-05, "loss": 1.5468, "step": 812 }, { "epoch": 0.21, "grad_norm": 0.2215089182231589, "learning_rate": 7.937356760886173e-05, "loss": 1.652, "step": 813 }, { "epoch": 0.21, "grad_norm": 0.23311367779561595, "learning_rate": 7.934810287751464e-05, "loss": 1.7622, "step": 814 }, { "epoch": 0.21, "grad_norm": 0.2955128136695084, "learning_rate": 7.932263814616757e-05, "loss": 2.0124, "step": 815 }, { "epoch": 0.21, "grad_norm": 0.25773817493774576, "learning_rate": 7.929717341482048e-05, "loss": 2.0789, "step": 816 }, { "epoch": 0.21, "grad_norm": 0.34237355221347887, "learning_rate": 7.927170868347339e-05, "loss": 2.0829, "step": 817 }, { "epoch": 0.21, "grad_norm": 0.28839753966260834, "learning_rate": 7.924624395212632e-05, "loss": 1.9994, "step": 818 }, { "epoch": 0.21, "grad_norm": 0.21212286265223146, "learning_rate": 7.922077922077923e-05, "loss": 1.7568, "step": 819 }, { "epoch": 0.21, "grad_norm": 0.2551081892350359, "learning_rate": 7.919531448943214e-05, "loss": 1.8875, "step": 820 }, { "epoch": 0.21, "grad_norm": 0.25490814024680464, "learning_rate": 7.916984975808505e-05, "loss": 1.8609, "step": 821 }, { "epoch": 0.21, "grad_norm": 0.2462200037724237, "learning_rate": 7.914438502673798e-05, "loss": 1.8756, "step": 822 }, { "epoch": 0.21, "grad_norm": 0.2495452947850181, "learning_rate": 7.911892029539089e-05, "loss": 1.987, "step": 823 }, { "epoch": 0.21, "grad_norm": 0.20413445933126373, "learning_rate": 7.90934555640438e-05, "loss": 1.5711, "step": 824 }, { "epoch": 0.21, "grad_norm": 0.2280167080933151, "learning_rate": 7.906799083269671e-05, "loss": 1.793, "step": 825 }, { "epoch": 0.21, "grad_norm": 0.287282709750182, "learning_rate": 7.904252610134964e-05, "loss": 1.904, "step": 826 }, { "epoch": 0.21, "grad_norm": 0.23544746439409528, "learning_rate": 7.901706137000255e-05, "loss": 1.7651, "step": 827 }, { "epoch": 0.21, "grad_norm": 0.2623434440061183, "learning_rate": 7.899159663865546e-05, "loss": 2.0431, "step": 828 }, { "epoch": 0.21, "grad_norm": 0.29813009277728214, "learning_rate": 7.896613190730839e-05, "loss": 2.17, "step": 829 }, { "epoch": 0.21, "grad_norm": 0.24173203062575552, "learning_rate": 7.89406671759613e-05, "loss": 1.8406, "step": 830 }, { "epoch": 0.21, "grad_norm": 0.2366699237638536, "learning_rate": 7.891520244461421e-05, "loss": 1.8798, "step": 831 }, { "epoch": 0.21, "grad_norm": 0.2797406812007206, "learning_rate": 7.888973771326712e-05, "loss": 1.9852, "step": 832 }, { "epoch": 0.21, "grad_norm": 0.2099481977989102, "learning_rate": 7.886427298192004e-05, "loss": 1.871, "step": 833 }, { "epoch": 0.21, "grad_norm": 0.22644585862553684, "learning_rate": 7.883880825057296e-05, "loss": 1.8052, "step": 834 }, { "epoch": 0.21, "grad_norm": 0.24565428898932076, "learning_rate": 7.881334351922587e-05, "loss": 2.1414, "step": 835 }, { "epoch": 0.21, "grad_norm": 0.2989449989300256, "learning_rate": 7.878787878787879e-05, "loss": 2.0666, "step": 836 }, { "epoch": 0.21, "grad_norm": 0.24542240881854216, "learning_rate": 7.876241405653171e-05, "loss": 1.954, "step": 837 }, { "epoch": 0.21, "grad_norm": 0.2672418304886155, "learning_rate": 7.873694932518462e-05, "loss": 2.0811, "step": 838 }, { "epoch": 0.21, "grad_norm": 0.2628618442827805, "learning_rate": 7.871148459383754e-05, "loss": 1.9559, "step": 839 }, { "epoch": 0.21, "grad_norm": 0.25650753303623913, "learning_rate": 7.868601986249046e-05, "loss": 2.2492, "step": 840 }, { "epoch": 0.21, "grad_norm": 0.25022188413316765, "learning_rate": 7.866055513114336e-05, "loss": 2.0061, "step": 841 }, { "epoch": 0.21, "grad_norm": 0.2295783023375302, "learning_rate": 7.863509039979629e-05, "loss": 1.876, "step": 842 }, { "epoch": 0.21, "grad_norm": 0.3438795452503926, "learning_rate": 7.86096256684492e-05, "loss": 2.2212, "step": 843 }, { "epoch": 0.21, "grad_norm": 0.20534516160381372, "learning_rate": 7.858416093710211e-05, "loss": 1.8619, "step": 844 }, { "epoch": 0.22, "grad_norm": 0.22899939201151417, "learning_rate": 7.855869620575504e-05, "loss": 1.7904, "step": 845 }, { "epoch": 0.22, "grad_norm": 0.26192343948953833, "learning_rate": 7.853323147440795e-05, "loss": 1.8803, "step": 846 }, { "epoch": 0.22, "grad_norm": 0.2402411049000976, "learning_rate": 7.850776674306086e-05, "loss": 1.7429, "step": 847 }, { "epoch": 0.22, "grad_norm": 0.23155189023956613, "learning_rate": 7.848230201171379e-05, "loss": 1.7725, "step": 848 }, { "epoch": 0.22, "grad_norm": 0.24159559089848437, "learning_rate": 7.845683728036668e-05, "loss": 1.8425, "step": 849 }, { "epoch": 0.22, "grad_norm": 0.22127225800264405, "learning_rate": 7.843137254901961e-05, "loss": 1.6196, "step": 850 }, { "epoch": 0.22, "grad_norm": 0.2329639876433451, "learning_rate": 7.840590781767254e-05, "loss": 2.1553, "step": 851 }, { "epoch": 0.22, "grad_norm": 0.2841636941610758, "learning_rate": 7.838044308632543e-05, "loss": 2.0216, "step": 852 }, { "epoch": 0.22, "grad_norm": 0.23058339678000966, "learning_rate": 7.835497835497836e-05, "loss": 1.7883, "step": 853 }, { "epoch": 0.22, "grad_norm": 0.24237110500048858, "learning_rate": 7.832951362363129e-05, "loss": 1.8807, "step": 854 }, { "epoch": 0.22, "grad_norm": 0.24655649789483783, "learning_rate": 7.830404889228418e-05, "loss": 1.8507, "step": 855 }, { "epoch": 0.22, "grad_norm": 0.28644569397987674, "learning_rate": 7.827858416093711e-05, "loss": 2.051, "step": 856 }, { "epoch": 0.22, "grad_norm": 0.23892097117174657, "learning_rate": 7.825311942959002e-05, "loss": 1.8824, "step": 857 }, { "epoch": 0.22, "grad_norm": 0.24085696112468405, "learning_rate": 7.822765469824293e-05, "loss": 1.6903, "step": 858 }, { "epoch": 0.22, "grad_norm": 0.23898549055005605, "learning_rate": 7.820218996689586e-05, "loss": 1.8986, "step": 859 }, { "epoch": 0.22, "grad_norm": 0.23549339168027972, "learning_rate": 7.817672523554876e-05, "loss": 1.8983, "step": 860 }, { "epoch": 0.22, "grad_norm": 0.2747127224535722, "learning_rate": 7.815126050420168e-05, "loss": 2.2242, "step": 861 }, { "epoch": 0.22, "grad_norm": 0.2970101274849217, "learning_rate": 7.812579577285461e-05, "loss": 2.0758, "step": 862 }, { "epoch": 0.22, "grad_norm": 0.2472828339287725, "learning_rate": 7.810033104150751e-05, "loss": 1.7372, "step": 863 }, { "epoch": 0.22, "grad_norm": 0.2623491807640201, "learning_rate": 7.807486631016043e-05, "loss": 1.6525, "step": 864 }, { "epoch": 0.22, "grad_norm": 0.2441195059224979, "learning_rate": 7.804940157881336e-05, "loss": 1.6852, "step": 865 }, { "epoch": 0.22, "grad_norm": 0.2572716038458357, "learning_rate": 7.802393684746626e-05, "loss": 2.002, "step": 866 }, { "epoch": 0.22, "grad_norm": 0.19699396179900933, "learning_rate": 7.799847211611918e-05, "loss": 1.7233, "step": 867 }, { "epoch": 0.22, "grad_norm": 0.20002757232567733, "learning_rate": 7.79730073847721e-05, "loss": 1.7233, "step": 868 }, { "epoch": 0.22, "grad_norm": 0.25667728121646555, "learning_rate": 7.794754265342501e-05, "loss": 1.9715, "step": 869 }, { "epoch": 0.22, "grad_norm": 0.24319349057248757, "learning_rate": 7.792207792207793e-05, "loss": 1.7304, "step": 870 }, { "epoch": 0.22, "grad_norm": 0.24726911696006718, "learning_rate": 7.789661319073084e-05, "loss": 1.7162, "step": 871 }, { "epoch": 0.22, "grad_norm": 0.28918886000356936, "learning_rate": 7.787114845938376e-05, "loss": 1.9583, "step": 872 }, { "epoch": 0.22, "grad_norm": 0.2577005200960701, "learning_rate": 7.784568372803668e-05, "loss": 1.9514, "step": 873 }, { "epoch": 0.22, "grad_norm": 0.24211230956641286, "learning_rate": 7.782021899668958e-05, "loss": 2.0073, "step": 874 }, { "epoch": 0.22, "grad_norm": 0.2986523229538993, "learning_rate": 7.779475426534251e-05, "loss": 1.9991, "step": 875 }, { "epoch": 0.22, "grad_norm": 0.2378687101520814, "learning_rate": 7.776928953399542e-05, "loss": 1.6956, "step": 876 }, { "epoch": 0.22, "grad_norm": 0.2448411504294662, "learning_rate": 7.774382480264833e-05, "loss": 1.8645, "step": 877 }, { "epoch": 0.22, "grad_norm": 0.260326811865287, "learning_rate": 7.771836007130126e-05, "loss": 1.8771, "step": 878 }, { "epoch": 0.22, "grad_norm": 0.23711140143142848, "learning_rate": 7.769289533995417e-05, "loss": 1.7325, "step": 879 }, { "epoch": 0.22, "grad_norm": 0.25312316152116904, "learning_rate": 7.766743060860708e-05, "loss": 1.8855, "step": 880 }, { "epoch": 0.22, "grad_norm": 0.2590127374184677, "learning_rate": 7.764196587726e-05, "loss": 1.7209, "step": 881 }, { "epoch": 0.22, "grad_norm": 0.24535401746192478, "learning_rate": 7.761650114591292e-05, "loss": 1.9677, "step": 882 }, { "epoch": 0.22, "grad_norm": 0.2612033865607243, "learning_rate": 7.759103641456583e-05, "loss": 1.9636, "step": 883 }, { "epoch": 0.22, "grad_norm": 0.2672248234251033, "learning_rate": 7.756557168321874e-05, "loss": 1.7793, "step": 884 }, { "epoch": 0.23, "grad_norm": 0.24691604804164172, "learning_rate": 7.754010695187165e-05, "loss": 2.0508, "step": 885 }, { "epoch": 0.23, "grad_norm": 0.24205617266569543, "learning_rate": 7.751464222052458e-05, "loss": 1.9556, "step": 886 }, { "epoch": 0.23, "grad_norm": 0.23950097320076194, "learning_rate": 7.748917748917749e-05, "loss": 1.8823, "step": 887 }, { "epoch": 0.23, "grad_norm": 0.23454313605150898, "learning_rate": 7.74637127578304e-05, "loss": 2.0957, "step": 888 }, { "epoch": 0.23, "grad_norm": 0.2518741276822946, "learning_rate": 7.743824802648333e-05, "loss": 1.984, "step": 889 }, { "epoch": 0.23, "grad_norm": 0.23913511906105006, "learning_rate": 7.741278329513624e-05, "loss": 1.6676, "step": 890 }, { "epoch": 0.23, "grad_norm": 0.23528027543073698, "learning_rate": 7.738731856378915e-05, "loss": 1.6761, "step": 891 }, { "epoch": 0.23, "grad_norm": 0.22746885632778302, "learning_rate": 7.736185383244207e-05, "loss": 1.9228, "step": 892 }, { "epoch": 0.23, "grad_norm": 0.2389799489749061, "learning_rate": 7.733638910109499e-05, "loss": 1.9622, "step": 893 }, { "epoch": 0.23, "grad_norm": 0.22927869240513388, "learning_rate": 7.73109243697479e-05, "loss": 1.7597, "step": 894 }, { "epoch": 0.23, "grad_norm": 0.24275816319261406, "learning_rate": 7.728545963840082e-05, "loss": 1.9038, "step": 895 }, { "epoch": 0.23, "grad_norm": 0.23010774399109485, "learning_rate": 7.725999490705373e-05, "loss": 1.9066, "step": 896 }, { "epoch": 0.23, "grad_norm": 0.25068412991723865, "learning_rate": 7.723453017570665e-05, "loss": 1.863, "step": 897 }, { "epoch": 0.23, "grad_norm": 0.23238635392792073, "learning_rate": 7.720906544435957e-05, "loss": 1.7561, "step": 898 }, { "epoch": 0.23, "grad_norm": 0.22639256714411624, "learning_rate": 7.718360071301248e-05, "loss": 1.8246, "step": 899 }, { "epoch": 0.23, "grad_norm": 0.23030938279670504, "learning_rate": 7.71581359816654e-05, "loss": 1.936, "step": 900 }, { "epoch": 0.23, "grad_norm": 0.2617661305141856, "learning_rate": 7.713267125031832e-05, "loss": 1.7431, "step": 901 }, { "epoch": 0.23, "grad_norm": 0.24547250231249076, "learning_rate": 7.710720651897123e-05, "loss": 1.8947, "step": 902 }, { "epoch": 0.23, "grad_norm": 0.2328105609608234, "learning_rate": 7.708174178762414e-05, "loss": 1.7748, "step": 903 }, { "epoch": 0.23, "grad_norm": 0.2186369177899911, "learning_rate": 7.705627705627707e-05, "loss": 1.6294, "step": 904 }, { "epoch": 0.23, "grad_norm": 0.23367491909838675, "learning_rate": 7.703081232492998e-05, "loss": 1.8374, "step": 905 }, { "epoch": 0.23, "grad_norm": 0.208741519321703, "learning_rate": 7.700534759358289e-05, "loss": 1.6709, "step": 906 }, { "epoch": 0.23, "grad_norm": 0.24236544870401433, "learning_rate": 7.69798828622358e-05, "loss": 1.911, "step": 907 }, { "epoch": 0.23, "grad_norm": 0.26782105549040786, "learning_rate": 7.695441813088873e-05, "loss": 2.0718, "step": 908 }, { "epoch": 0.23, "grad_norm": 0.23612976419328677, "learning_rate": 7.692895339954164e-05, "loss": 1.8405, "step": 909 }, { "epoch": 0.23, "grad_norm": 0.24821660099782578, "learning_rate": 7.690348866819455e-05, "loss": 2.1939, "step": 910 }, { "epoch": 0.23, "grad_norm": 0.2448911954899524, "learning_rate": 7.687802393684746e-05, "loss": 2.1059, "step": 911 }, { "epoch": 0.23, "grad_norm": 0.23867366871261234, "learning_rate": 7.685255920550039e-05, "loss": 1.8553, "step": 912 }, { "epoch": 0.23, "grad_norm": 0.2541855083319077, "learning_rate": 7.68270944741533e-05, "loss": 1.7859, "step": 913 }, { "epoch": 0.23, "grad_norm": 0.3405341009591459, "learning_rate": 7.680162974280621e-05, "loss": 1.9631, "step": 914 }, { "epoch": 0.23, "grad_norm": 0.23780941759435273, "learning_rate": 7.677616501145914e-05, "loss": 1.8548, "step": 915 }, { "epoch": 0.23, "grad_norm": 0.2670959667222279, "learning_rate": 7.675070028011205e-05, "loss": 2.1335, "step": 916 }, { "epoch": 0.23, "grad_norm": 0.2149395295702038, "learning_rate": 7.672523554876496e-05, "loss": 1.7969, "step": 917 }, { "epoch": 0.23, "grad_norm": 0.21035019919291034, "learning_rate": 7.669977081741789e-05, "loss": 1.7388, "step": 918 }, { "epoch": 0.23, "grad_norm": 0.26288329988564973, "learning_rate": 7.667430608607079e-05, "loss": 2.0539, "step": 919 }, { "epoch": 0.23, "grad_norm": 0.22031332719255275, "learning_rate": 7.664884135472371e-05, "loss": 1.7581, "step": 920 }, { "epoch": 0.23, "grad_norm": 0.2339150206578499, "learning_rate": 7.662337662337662e-05, "loss": 1.74, "step": 921 }, { "epoch": 0.23, "grad_norm": 0.2909098439332173, "learning_rate": 7.659791189202954e-05, "loss": 2.0619, "step": 922 }, { "epoch": 0.23, "grad_norm": 0.26884009714027207, "learning_rate": 7.657244716068246e-05, "loss": 1.7417, "step": 923 }, { "epoch": 0.24, "grad_norm": 0.2233060923878725, "learning_rate": 7.654698242933537e-05, "loss": 1.7564, "step": 924 }, { "epoch": 0.24, "grad_norm": 0.21819773852088117, "learning_rate": 7.652151769798829e-05, "loss": 1.8333, "step": 925 }, { "epoch": 0.24, "grad_norm": 0.2604109413789183, "learning_rate": 7.649605296664121e-05, "loss": 2.1359, "step": 926 }, { "epoch": 0.24, "grad_norm": 0.2632367252089379, "learning_rate": 7.647058823529411e-05, "loss": 1.8613, "step": 927 }, { "epoch": 0.24, "grad_norm": 0.2586751291202257, "learning_rate": 7.644512350394704e-05, "loss": 1.7107, "step": 928 }, { "epoch": 0.24, "grad_norm": 0.21636884551989266, "learning_rate": 7.641965877259996e-05, "loss": 2.013, "step": 929 }, { "epoch": 0.24, "grad_norm": 0.2280765292524814, "learning_rate": 7.639419404125286e-05, "loss": 1.9611, "step": 930 }, { "epoch": 0.24, "grad_norm": 0.24032350762980376, "learning_rate": 7.636872930990579e-05, "loss": 1.7703, "step": 931 }, { "epoch": 0.24, "grad_norm": 0.24868118103332415, "learning_rate": 7.63432645785587e-05, "loss": 1.9188, "step": 932 }, { "epoch": 0.24, "grad_norm": 0.19774779322088548, "learning_rate": 7.631779984721161e-05, "loss": 1.7019, "step": 933 }, { "epoch": 0.24, "grad_norm": 0.23046990572106194, "learning_rate": 7.629233511586454e-05, "loss": 1.9484, "step": 934 }, { "epoch": 0.24, "grad_norm": 0.24550959723771484, "learning_rate": 7.626687038451745e-05, "loss": 1.7939, "step": 935 }, { "epoch": 0.24, "grad_norm": 0.2809890755314992, "learning_rate": 7.624140565317036e-05, "loss": 1.9596, "step": 936 }, { "epoch": 0.24, "grad_norm": 0.31237752421726567, "learning_rate": 7.621594092182329e-05, "loss": 2.0557, "step": 937 }, { "epoch": 0.24, "grad_norm": 0.23455083926617307, "learning_rate": 7.619047619047618e-05, "loss": 1.617, "step": 938 }, { "epoch": 0.24, "grad_norm": 0.24271565192298242, "learning_rate": 7.616501145912911e-05, "loss": 1.9186, "step": 939 }, { "epoch": 0.24, "grad_norm": 0.3234705161782687, "learning_rate": 7.613954672778204e-05, "loss": 2.2312, "step": 940 }, { "epoch": 0.24, "grad_norm": 0.20938309397638855, "learning_rate": 7.611408199643493e-05, "loss": 1.8662, "step": 941 }, { "epoch": 0.24, "grad_norm": 0.26719664786668124, "learning_rate": 7.608861726508786e-05, "loss": 1.9545, "step": 942 }, { "epoch": 0.24, "grad_norm": 0.24819446288048688, "learning_rate": 7.606315253374077e-05, "loss": 1.9658, "step": 943 }, { "epoch": 0.24, "grad_norm": 0.2756837137447019, "learning_rate": 7.603768780239368e-05, "loss": 1.7558, "step": 944 }, { "epoch": 0.24, "grad_norm": 0.242656553492998, "learning_rate": 7.601222307104661e-05, "loss": 2.151, "step": 945 }, { "epoch": 0.24, "grad_norm": 0.26971822970237047, "learning_rate": 7.598675833969952e-05, "loss": 1.9022, "step": 946 }, { "epoch": 0.24, "grad_norm": 0.2519256813420929, "learning_rate": 7.596129360835243e-05, "loss": 1.9729, "step": 947 }, { "epoch": 0.24, "grad_norm": 0.29239166643883435, "learning_rate": 7.593582887700536e-05, "loss": 2.0671, "step": 948 }, { "epoch": 0.24, "grad_norm": 0.25956244876566215, "learning_rate": 7.591036414565826e-05, "loss": 2.201, "step": 949 }, { "epoch": 0.24, "grad_norm": 0.27114146689378416, "learning_rate": 7.588489941431118e-05, "loss": 2.095, "step": 950 }, { "epoch": 0.24, "grad_norm": 0.25017787148928233, "learning_rate": 7.585943468296411e-05, "loss": 1.9677, "step": 951 }, { "epoch": 0.24, "grad_norm": 0.21798966454418292, "learning_rate": 7.583396995161701e-05, "loss": 1.9648, "step": 952 }, { "epoch": 0.24, "grad_norm": 0.3408898443872379, "learning_rate": 7.580850522026993e-05, "loss": 1.8321, "step": 953 }, { "epoch": 0.24, "grad_norm": 0.22591479387853222, "learning_rate": 7.578304048892285e-05, "loss": 1.9954, "step": 954 }, { "epoch": 0.24, "grad_norm": 0.2328924736172243, "learning_rate": 7.575757575757576e-05, "loss": 1.9178, "step": 955 }, { "epoch": 0.24, "grad_norm": 0.2173435089965625, "learning_rate": 7.573211102622868e-05, "loss": 1.7205, "step": 956 }, { "epoch": 0.24, "grad_norm": 0.2456469263841274, "learning_rate": 7.57066462948816e-05, "loss": 1.6118, "step": 957 }, { "epoch": 0.24, "grad_norm": 0.2887778298066372, "learning_rate": 7.568118156353451e-05, "loss": 2.1119, "step": 958 }, { "epoch": 0.24, "grad_norm": 0.242980964697057, "learning_rate": 7.565571683218743e-05, "loss": 1.9016, "step": 959 }, { "epoch": 0.24, "grad_norm": 0.38571292428187637, "learning_rate": 7.563025210084033e-05, "loss": 2.0429, "step": 960 }, { "epoch": 0.24, "grad_norm": 0.25195720580588754, "learning_rate": 7.560478736949326e-05, "loss": 1.8982, "step": 961 }, { "epoch": 0.24, "grad_norm": 0.320485347539328, "learning_rate": 7.557932263814617e-05, "loss": 2.1381, "step": 962 }, { "epoch": 0.25, "grad_norm": 0.2251185492296154, "learning_rate": 7.555385790679908e-05, "loss": 1.8812, "step": 963 }, { "epoch": 0.25, "grad_norm": 0.2365123251612771, "learning_rate": 7.552839317545201e-05, "loss": 1.6989, "step": 964 }, { "epoch": 0.25, "grad_norm": 0.2251034485574401, "learning_rate": 7.550292844410492e-05, "loss": 1.5589, "step": 965 }, { "epoch": 0.25, "grad_norm": 0.1988388721881555, "learning_rate": 7.547746371275783e-05, "loss": 1.8252, "step": 966 }, { "epoch": 0.25, "grad_norm": 0.28923282244887305, "learning_rate": 7.545199898141076e-05, "loss": 2.1709, "step": 967 }, { "epoch": 0.25, "grad_norm": 0.2132909755096321, "learning_rate": 7.542653425006367e-05, "loss": 1.7329, "step": 968 }, { "epoch": 0.25, "grad_norm": 0.22508147837329856, "learning_rate": 7.540106951871658e-05, "loss": 1.5974, "step": 969 }, { "epoch": 0.25, "grad_norm": 0.2600908323181835, "learning_rate": 7.537560478736949e-05, "loss": 2.1381, "step": 970 }, { "epoch": 0.25, "grad_norm": 0.322659985241271, "learning_rate": 7.53501400560224e-05, "loss": 1.8803, "step": 971 }, { "epoch": 0.25, "grad_norm": 0.2474414841879655, "learning_rate": 7.532467532467533e-05, "loss": 1.8566, "step": 972 }, { "epoch": 0.25, "grad_norm": 0.25302935076454597, "learning_rate": 7.529921059332824e-05, "loss": 1.9085, "step": 973 }, { "epoch": 0.25, "grad_norm": 0.22927825371223473, "learning_rate": 7.527374586198115e-05, "loss": 1.5849, "step": 974 }, { "epoch": 0.25, "grad_norm": 0.23052096987578194, "learning_rate": 7.524828113063408e-05, "loss": 1.7076, "step": 975 }, { "epoch": 0.25, "grad_norm": 0.22751792514838975, "learning_rate": 7.522281639928699e-05, "loss": 1.7537, "step": 976 }, { "epoch": 0.25, "grad_norm": 0.2147746581907753, "learning_rate": 7.51973516679399e-05, "loss": 1.741, "step": 977 }, { "epoch": 0.25, "grad_norm": 0.24550996140637407, "learning_rate": 7.517188693659283e-05, "loss": 1.784, "step": 978 }, { "epoch": 0.25, "grad_norm": 0.2880578625463606, "learning_rate": 7.514642220524574e-05, "loss": 2.117, "step": 979 }, { "epoch": 0.25, "grad_norm": 0.2504203808461966, "learning_rate": 7.512095747389865e-05, "loss": 1.7171, "step": 980 }, { "epoch": 0.25, "grad_norm": 0.28360856072709995, "learning_rate": 7.509549274255157e-05, "loss": 1.8599, "step": 981 }, { "epoch": 0.25, "grad_norm": 0.27146575369882403, "learning_rate": 7.507002801120448e-05, "loss": 1.873, "step": 982 }, { "epoch": 0.25, "grad_norm": 0.24146670322185135, "learning_rate": 7.50445632798574e-05, "loss": 1.6318, "step": 983 }, { "epoch": 0.25, "grad_norm": 0.22634116410765562, "learning_rate": 7.501909854851032e-05, "loss": 1.7617, "step": 984 }, { "epoch": 0.25, "grad_norm": 0.22952185094142133, "learning_rate": 7.499363381716323e-05, "loss": 1.8974, "step": 985 }, { "epoch": 0.25, "grad_norm": 0.25222705764804615, "learning_rate": 7.496816908581615e-05, "loss": 1.6948, "step": 986 }, { "epoch": 0.25, "grad_norm": 0.25059176978610787, "learning_rate": 7.494270435446907e-05, "loss": 1.8281, "step": 987 }, { "epoch": 0.25, "grad_norm": 0.23918773636957016, "learning_rate": 7.491723962312198e-05, "loss": 1.6135, "step": 988 }, { "epoch": 0.25, "grad_norm": 0.2602172596736344, "learning_rate": 7.489177489177489e-05, "loss": 1.861, "step": 989 }, { "epoch": 0.25, "grad_norm": 0.23004126099624014, "learning_rate": 7.486631016042782e-05, "loss": 1.9447, "step": 990 }, { "epoch": 0.25, "grad_norm": 0.22437029844262085, "learning_rate": 7.484084542908073e-05, "loss": 1.9113, "step": 991 }, { "epoch": 0.25, "grad_norm": 0.22822509947135397, "learning_rate": 7.481538069773364e-05, "loss": 1.7565, "step": 992 }, { "epoch": 0.25, "grad_norm": 0.22656387295800237, "learning_rate": 7.478991596638657e-05, "loss": 1.5697, "step": 993 }, { "epoch": 0.25, "grad_norm": 0.2272064577690763, "learning_rate": 7.476445123503948e-05, "loss": 1.8419, "step": 994 }, { "epoch": 0.25, "grad_norm": 0.2374931027013204, "learning_rate": 7.473898650369239e-05, "loss": 1.8254, "step": 995 }, { "epoch": 0.25, "grad_norm": 0.2804124733189521, "learning_rate": 7.47135217723453e-05, "loss": 1.9465, "step": 996 }, { "epoch": 0.25, "grad_norm": 0.21066527991753883, "learning_rate": 7.468805704099821e-05, "loss": 1.9101, "step": 997 }, { "epoch": 0.25, "grad_norm": 0.24474048924987085, "learning_rate": 7.466259230965114e-05, "loss": 1.5658, "step": 998 }, { "epoch": 0.25, "grad_norm": 0.22675961108484224, "learning_rate": 7.463712757830405e-05, "loss": 1.9399, "step": 999 }, { "epoch": 0.25, "grad_norm": 0.24537736830780577, "learning_rate": 7.461166284695696e-05, "loss": 1.9266, "step": 1000 }, { "epoch": 0.25, "grad_norm": 0.2873054405840476, "learning_rate": 7.458619811560989e-05, "loss": 1.8692, "step": 1001 }, { "epoch": 0.26, "grad_norm": 0.26074663079865634, "learning_rate": 7.45607333842628e-05, "loss": 1.6851, "step": 1002 }, { "epoch": 0.26, "grad_norm": 0.29185094574968984, "learning_rate": 7.453526865291571e-05, "loss": 1.7844, "step": 1003 }, { "epoch": 0.26, "grad_norm": 0.2667254903977434, "learning_rate": 7.450980392156864e-05, "loss": 1.9728, "step": 1004 }, { "epoch": 0.26, "grad_norm": 0.23175351028139793, "learning_rate": 7.448433919022154e-05, "loss": 1.8085, "step": 1005 }, { "epoch": 0.26, "grad_norm": 0.23395886035315197, "learning_rate": 7.445887445887446e-05, "loss": 1.8175, "step": 1006 }, { "epoch": 0.26, "grad_norm": 0.3028474658033109, "learning_rate": 7.443340972752738e-05, "loss": 2.0443, "step": 1007 }, { "epoch": 0.26, "grad_norm": 0.2580042039617424, "learning_rate": 7.440794499618029e-05, "loss": 1.898, "step": 1008 }, { "epoch": 0.26, "grad_norm": 0.20881989503877474, "learning_rate": 7.438248026483321e-05, "loss": 1.7365, "step": 1009 }, { "epoch": 0.26, "grad_norm": 0.21758219956532768, "learning_rate": 7.435701553348613e-05, "loss": 1.8344, "step": 1010 }, { "epoch": 0.26, "grad_norm": 0.30945228709449574, "learning_rate": 7.433155080213904e-05, "loss": 1.803, "step": 1011 }, { "epoch": 0.26, "grad_norm": 0.2778470482815492, "learning_rate": 7.430608607079196e-05, "loss": 1.9486, "step": 1012 }, { "epoch": 0.26, "grad_norm": 0.27796491719991984, "learning_rate": 7.428062133944486e-05, "loss": 1.9901, "step": 1013 }, { "epoch": 0.26, "grad_norm": 0.2823655601272822, "learning_rate": 7.425515660809779e-05, "loss": 2.0025, "step": 1014 }, { "epoch": 0.26, "grad_norm": 0.21818991040931096, "learning_rate": 7.422969187675071e-05, "loss": 1.824, "step": 1015 }, { "epoch": 0.26, "grad_norm": 0.2402002884167473, "learning_rate": 7.420422714540361e-05, "loss": 1.8006, "step": 1016 }, { "epoch": 0.26, "grad_norm": 0.23589217748546118, "learning_rate": 7.417876241405654e-05, "loss": 1.8656, "step": 1017 }, { "epoch": 0.26, "grad_norm": 0.23281470528140585, "learning_rate": 7.415329768270945e-05, "loss": 1.5889, "step": 1018 }, { "epoch": 0.26, "grad_norm": 0.29580881001459797, "learning_rate": 7.412783295136236e-05, "loss": 1.7995, "step": 1019 }, { "epoch": 0.26, "grad_norm": 0.22434470297566775, "learning_rate": 7.410236822001529e-05, "loss": 1.8127, "step": 1020 }, { "epoch": 0.26, "grad_norm": 0.2914740516799641, "learning_rate": 7.40769034886682e-05, "loss": 2.2744, "step": 1021 }, { "epoch": 0.26, "grad_norm": 0.2742825121529459, "learning_rate": 7.405143875732111e-05, "loss": 1.9129, "step": 1022 }, { "epoch": 0.26, "grad_norm": 0.2334532812955576, "learning_rate": 7.402597402597404e-05, "loss": 1.7953, "step": 1023 }, { "epoch": 0.26, "grad_norm": 0.24967774841210677, "learning_rate": 7.400050929462693e-05, "loss": 1.8679, "step": 1024 }, { "epoch": 0.26, "grad_norm": 0.2595031961508815, "learning_rate": 7.397504456327986e-05, "loss": 1.9454, "step": 1025 }, { "epoch": 0.26, "grad_norm": 0.2521915933685689, "learning_rate": 7.394957983193279e-05, "loss": 2.1809, "step": 1026 }, { "epoch": 0.26, "grad_norm": 0.26113880010800444, "learning_rate": 7.392411510058568e-05, "loss": 1.7928, "step": 1027 }, { "epoch": 0.26, "grad_norm": 0.24503990870092038, "learning_rate": 7.389865036923861e-05, "loss": 1.8939, "step": 1028 }, { "epoch": 0.26, "grad_norm": 0.22210336940722358, "learning_rate": 7.387318563789152e-05, "loss": 1.7962, "step": 1029 }, { "epoch": 0.26, "grad_norm": 0.2868276758158762, "learning_rate": 7.384772090654443e-05, "loss": 2.1045, "step": 1030 }, { "epoch": 0.26, "grad_norm": 0.24116223727815847, "learning_rate": 7.382225617519736e-05, "loss": 1.7254, "step": 1031 }, { "epoch": 0.26, "grad_norm": 0.2789040316966339, "learning_rate": 7.379679144385027e-05, "loss": 1.9977, "step": 1032 }, { "epoch": 0.26, "grad_norm": 0.24868935259042466, "learning_rate": 7.377132671250318e-05, "loss": 1.7413, "step": 1033 }, { "epoch": 0.26, "grad_norm": 0.2558503040132623, "learning_rate": 7.374586198115611e-05, "loss": 1.9594, "step": 1034 }, { "epoch": 0.26, "grad_norm": 0.2372110362556985, "learning_rate": 7.372039724980901e-05, "loss": 1.6742, "step": 1035 }, { "epoch": 0.26, "grad_norm": 0.2215575331170209, "learning_rate": 7.369493251846193e-05, "loss": 1.7478, "step": 1036 }, { "epoch": 0.26, "grad_norm": 0.29277068754588087, "learning_rate": 7.366946778711486e-05, "loss": 1.8855, "step": 1037 }, { "epoch": 0.26, "grad_norm": 0.247853489079965, "learning_rate": 7.364400305576776e-05, "loss": 1.9515, "step": 1038 }, { "epoch": 0.26, "grad_norm": 0.3179033441781071, "learning_rate": 7.361853832442068e-05, "loss": 1.9779, "step": 1039 }, { "epoch": 0.26, "grad_norm": 0.21068073469156284, "learning_rate": 7.35930735930736e-05, "loss": 1.7137, "step": 1040 }, { "epoch": 0.26, "grad_norm": 0.31388598886043856, "learning_rate": 7.356760886172651e-05, "loss": 1.9984, "step": 1041 }, { "epoch": 0.27, "grad_norm": 0.26587855635014485, "learning_rate": 7.354214413037943e-05, "loss": 1.9973, "step": 1042 }, { "epoch": 0.27, "grad_norm": 0.2843791746524781, "learning_rate": 7.351667939903235e-05, "loss": 2.2078, "step": 1043 }, { "epoch": 0.27, "grad_norm": 0.22613839212935044, "learning_rate": 7.349121466768526e-05, "loss": 2.0572, "step": 1044 }, { "epoch": 0.27, "grad_norm": 0.24078092198779658, "learning_rate": 7.346574993633818e-05, "loss": 2.0019, "step": 1045 }, { "epoch": 0.27, "grad_norm": 0.22878717117869515, "learning_rate": 7.344028520499108e-05, "loss": 2.0114, "step": 1046 }, { "epoch": 0.27, "grad_norm": 0.28529572668418474, "learning_rate": 7.341482047364401e-05, "loss": 2.0328, "step": 1047 }, { "epoch": 0.27, "grad_norm": 0.3037342908902211, "learning_rate": 7.338935574229692e-05, "loss": 1.9793, "step": 1048 }, { "epoch": 0.27, "grad_norm": 0.2976567866603389, "learning_rate": 7.336389101094983e-05, "loss": 2.0043, "step": 1049 }, { "epoch": 0.27, "grad_norm": 0.22623327604414598, "learning_rate": 7.333842627960276e-05, "loss": 1.8501, "step": 1050 }, { "epoch": 0.27, "grad_norm": 0.22202210720923746, "learning_rate": 7.331296154825567e-05, "loss": 1.7602, "step": 1051 }, { "epoch": 0.27, "grad_norm": 0.25556327342706764, "learning_rate": 7.328749681690858e-05, "loss": 1.9428, "step": 1052 }, { "epoch": 0.27, "grad_norm": 0.30017478291905664, "learning_rate": 7.326203208556151e-05, "loss": 1.9114, "step": 1053 }, { "epoch": 0.27, "grad_norm": 0.2530886618115332, "learning_rate": 7.323656735421442e-05, "loss": 1.6587, "step": 1054 }, { "epoch": 0.27, "grad_norm": 0.26379455255653006, "learning_rate": 7.321110262286733e-05, "loss": 1.8647, "step": 1055 }, { "epoch": 0.27, "grad_norm": 0.30701511136802717, "learning_rate": 7.318563789152026e-05, "loss": 2.1388, "step": 1056 }, { "epoch": 0.27, "grad_norm": 0.2851724881291049, "learning_rate": 7.316017316017317e-05, "loss": 1.888, "step": 1057 }, { "epoch": 0.27, "grad_norm": 0.26338541930182763, "learning_rate": 7.313470842882608e-05, "loss": 2.1868, "step": 1058 }, { "epoch": 0.27, "grad_norm": 0.24415074899663258, "learning_rate": 7.310924369747899e-05, "loss": 1.6828, "step": 1059 }, { "epoch": 0.27, "grad_norm": 0.2765224546330694, "learning_rate": 7.30837789661319e-05, "loss": 1.9021, "step": 1060 }, { "epoch": 0.27, "grad_norm": 0.26298709533401354, "learning_rate": 7.305831423478483e-05, "loss": 1.7669, "step": 1061 }, { "epoch": 0.27, "grad_norm": 0.21868878597983632, "learning_rate": 7.303284950343774e-05, "loss": 1.8156, "step": 1062 }, { "epoch": 0.27, "grad_norm": 0.2383054502111241, "learning_rate": 7.300738477209065e-05, "loss": 1.8936, "step": 1063 }, { "epoch": 0.27, "grad_norm": 0.3029936387854745, "learning_rate": 7.298192004074358e-05, "loss": 2.1997, "step": 1064 }, { "epoch": 0.27, "grad_norm": 0.2240249078331532, "learning_rate": 7.295645530939649e-05, "loss": 1.8388, "step": 1065 }, { "epoch": 0.27, "grad_norm": 0.30748644275552534, "learning_rate": 7.29309905780494e-05, "loss": 2.1551, "step": 1066 }, { "epoch": 0.27, "grad_norm": 0.31235704728599206, "learning_rate": 7.290552584670232e-05, "loss": 2.044, "step": 1067 }, { "epoch": 0.27, "grad_norm": 0.24152539871409162, "learning_rate": 7.288006111535524e-05, "loss": 1.8208, "step": 1068 }, { "epoch": 0.27, "grad_norm": 0.26192652582359693, "learning_rate": 7.285459638400815e-05, "loss": 1.9027, "step": 1069 }, { "epoch": 0.27, "grad_norm": 0.2731294531644669, "learning_rate": 7.282913165266107e-05, "loss": 1.9331, "step": 1070 }, { "epoch": 0.27, "grad_norm": 0.25505936744970453, "learning_rate": 7.280366692131398e-05, "loss": 1.903, "step": 1071 }, { "epoch": 0.27, "grad_norm": 0.2282500476257383, "learning_rate": 7.27782021899669e-05, "loss": 1.7402, "step": 1072 }, { "epoch": 0.27, "grad_norm": 0.2620146487850985, "learning_rate": 7.275273745861982e-05, "loss": 1.9199, "step": 1073 }, { "epoch": 0.27, "grad_norm": 0.23315122911921574, "learning_rate": 7.272727272727273e-05, "loss": 1.7629, "step": 1074 }, { "epoch": 0.27, "grad_norm": 0.23948199609946433, "learning_rate": 7.270180799592564e-05, "loss": 1.7816, "step": 1075 }, { "epoch": 0.27, "grad_norm": 0.21465318204237038, "learning_rate": 7.267634326457857e-05, "loss": 1.7512, "step": 1076 }, { "epoch": 0.27, "grad_norm": 0.27432777704428774, "learning_rate": 7.265087853323148e-05, "loss": 1.8063, "step": 1077 }, { "epoch": 0.27, "grad_norm": 0.2613009919134964, "learning_rate": 7.262541380188439e-05, "loss": 1.9957, "step": 1078 }, { "epoch": 0.27, "grad_norm": 0.24222462892491128, "learning_rate": 7.259994907053732e-05, "loss": 1.9051, "step": 1079 }, { "epoch": 0.27, "grad_norm": 0.23796910023553916, "learning_rate": 7.257448433919023e-05, "loss": 1.6754, "step": 1080 }, { "epoch": 0.28, "grad_norm": 0.27225403124782055, "learning_rate": 7.254901960784314e-05, "loss": 1.9675, "step": 1081 }, { "epoch": 0.28, "grad_norm": 0.23667078161663896, "learning_rate": 7.252355487649605e-05, "loss": 1.8707, "step": 1082 }, { "epoch": 0.28, "grad_norm": 0.24004472053880488, "learning_rate": 7.249809014514896e-05, "loss": 1.6981, "step": 1083 }, { "epoch": 0.28, "grad_norm": 0.2393421569384702, "learning_rate": 7.247262541380189e-05, "loss": 1.8638, "step": 1084 }, { "epoch": 0.28, "grad_norm": 0.24813366670630538, "learning_rate": 7.24471606824548e-05, "loss": 2.007, "step": 1085 }, { "epoch": 0.28, "grad_norm": 0.25643140605516485, "learning_rate": 7.242169595110771e-05, "loss": 2.1365, "step": 1086 }, { "epoch": 0.28, "grad_norm": 0.2989812844911387, "learning_rate": 7.239623121976064e-05, "loss": 1.7626, "step": 1087 }, { "epoch": 0.28, "grad_norm": 0.2979038380259418, "learning_rate": 7.237076648841355e-05, "loss": 2.0305, "step": 1088 }, { "epoch": 0.28, "grad_norm": 0.24981192130202395, "learning_rate": 7.234530175706646e-05, "loss": 1.6851, "step": 1089 }, { "epoch": 0.28, "grad_norm": 0.24425436040945306, "learning_rate": 7.231983702571939e-05, "loss": 1.8165, "step": 1090 }, { "epoch": 0.28, "grad_norm": 0.2700889456790734, "learning_rate": 7.229437229437229e-05, "loss": 2.0746, "step": 1091 }, { "epoch": 0.28, "grad_norm": 0.3365204875725978, "learning_rate": 7.226890756302521e-05, "loss": 1.7851, "step": 1092 }, { "epoch": 0.28, "grad_norm": 0.2675215111309902, "learning_rate": 7.224344283167813e-05, "loss": 1.8756, "step": 1093 }, { "epoch": 0.28, "grad_norm": 0.25048149947416654, "learning_rate": 7.221797810033104e-05, "loss": 1.7192, "step": 1094 }, { "epoch": 0.28, "grad_norm": 0.23477026669805706, "learning_rate": 7.219251336898396e-05, "loss": 1.859, "step": 1095 }, { "epoch": 0.28, "grad_norm": 0.28073662687213674, "learning_rate": 7.216704863763688e-05, "loss": 1.8868, "step": 1096 }, { "epoch": 0.28, "grad_norm": 0.2989689364308601, "learning_rate": 7.214158390628979e-05, "loss": 1.8827, "step": 1097 }, { "epoch": 0.28, "grad_norm": 0.2810212105436412, "learning_rate": 7.211611917494271e-05, "loss": 2.016, "step": 1098 }, { "epoch": 0.28, "grad_norm": 0.2543432436622935, "learning_rate": 7.209065444359563e-05, "loss": 1.7812, "step": 1099 }, { "epoch": 0.28, "grad_norm": 0.3613659400741098, "learning_rate": 7.206518971224854e-05, "loss": 2.0044, "step": 1100 }, { "epoch": 0.28, "grad_norm": 0.22297225245001823, "learning_rate": 7.203972498090146e-05, "loss": 1.6883, "step": 1101 }, { "epoch": 0.28, "grad_norm": 0.23228286946036356, "learning_rate": 7.201426024955436e-05, "loss": 1.8655, "step": 1102 }, { "epoch": 0.28, "grad_norm": 0.25222815059535875, "learning_rate": 7.198879551820729e-05, "loss": 1.9635, "step": 1103 }, { "epoch": 0.28, "grad_norm": 0.26860423237937797, "learning_rate": 7.196333078686021e-05, "loss": 2.0223, "step": 1104 }, { "epoch": 0.28, "grad_norm": 0.22766323548846765, "learning_rate": 7.193786605551311e-05, "loss": 1.8319, "step": 1105 }, { "epoch": 0.28, "grad_norm": 0.26320673570112335, "learning_rate": 7.191240132416604e-05, "loss": 1.8235, "step": 1106 }, { "epoch": 0.28, "grad_norm": 0.2472466981342383, "learning_rate": 7.188693659281895e-05, "loss": 1.8836, "step": 1107 }, { "epoch": 0.28, "grad_norm": 0.2930360716717708, "learning_rate": 7.186147186147186e-05, "loss": 2.036, "step": 1108 }, { "epoch": 0.28, "grad_norm": 0.29167954666218077, "learning_rate": 7.183600713012479e-05, "loss": 1.9315, "step": 1109 }, { "epoch": 0.28, "grad_norm": 0.2794854387898721, "learning_rate": 7.181054239877769e-05, "loss": 1.9765, "step": 1110 }, { "epoch": 0.28, "grad_norm": 0.23546131629828576, "learning_rate": 7.178507766743061e-05, "loss": 1.7479, "step": 1111 }, { "epoch": 0.28, "grad_norm": 0.2848010921499858, "learning_rate": 7.175961293608354e-05, "loss": 2.0008, "step": 1112 }, { "epoch": 0.28, "grad_norm": 0.23285082495229736, "learning_rate": 7.173414820473643e-05, "loss": 1.7227, "step": 1113 }, { "epoch": 0.28, "grad_norm": 0.23277238653860835, "learning_rate": 7.170868347338936e-05, "loss": 1.7846, "step": 1114 }, { "epoch": 0.28, "grad_norm": 0.24855975945486786, "learning_rate": 7.168321874204229e-05, "loss": 1.8656, "step": 1115 }, { "epoch": 0.28, "grad_norm": 0.23104977509797836, "learning_rate": 7.165775401069518e-05, "loss": 1.84, "step": 1116 }, { "epoch": 0.28, "grad_norm": 0.2543793452264319, "learning_rate": 7.163228927934811e-05, "loss": 1.9373, "step": 1117 }, { "epoch": 0.28, "grad_norm": 0.29724953265750936, "learning_rate": 7.160682454800102e-05, "loss": 2.0002, "step": 1118 }, { "epoch": 0.28, "grad_norm": 0.25150347662887024, "learning_rate": 7.158135981665393e-05, "loss": 2.0944, "step": 1119 }, { "epoch": 0.29, "grad_norm": 0.30103818645027586, "learning_rate": 7.155589508530686e-05, "loss": 2.1487, "step": 1120 }, { "epoch": 0.29, "grad_norm": 0.2686274065919044, "learning_rate": 7.153043035395977e-05, "loss": 2.0224, "step": 1121 }, { "epoch": 0.29, "grad_norm": 0.2566219497226242, "learning_rate": 7.150496562261268e-05, "loss": 1.6538, "step": 1122 }, { "epoch": 0.29, "grad_norm": 0.25612670095278256, "learning_rate": 7.147950089126561e-05, "loss": 1.7372, "step": 1123 }, { "epoch": 0.29, "grad_norm": 0.2550233621507488, "learning_rate": 7.145403615991851e-05, "loss": 2.0321, "step": 1124 }, { "epoch": 0.29, "grad_norm": 0.24179696312626972, "learning_rate": 7.142857142857143e-05, "loss": 1.7023, "step": 1125 }, { "epoch": 0.29, "grad_norm": 0.2377412493034981, "learning_rate": 7.140310669722435e-05, "loss": 1.9684, "step": 1126 }, { "epoch": 0.29, "grad_norm": 0.23372052525767906, "learning_rate": 7.137764196587726e-05, "loss": 1.787, "step": 1127 }, { "epoch": 0.29, "grad_norm": 0.1985849320909872, "learning_rate": 7.135217723453018e-05, "loss": 1.716, "step": 1128 }, { "epoch": 0.29, "grad_norm": 0.2725190541622667, "learning_rate": 7.13267125031831e-05, "loss": 2.0011, "step": 1129 }, { "epoch": 0.29, "grad_norm": 0.20734841865394643, "learning_rate": 7.130124777183601e-05, "loss": 1.7306, "step": 1130 }, { "epoch": 0.29, "grad_norm": 0.24358986338218003, "learning_rate": 7.127578304048893e-05, "loss": 1.655, "step": 1131 }, { "epoch": 0.29, "grad_norm": 0.2673702685671822, "learning_rate": 7.125031830914185e-05, "loss": 2.3778, "step": 1132 }, { "epoch": 0.29, "grad_norm": 0.2530655072787947, "learning_rate": 7.122485357779476e-05, "loss": 1.8649, "step": 1133 }, { "epoch": 0.29, "grad_norm": 0.32523596569646346, "learning_rate": 7.119938884644768e-05, "loss": 2.0269, "step": 1134 }, { "epoch": 0.29, "grad_norm": 0.26169970428856776, "learning_rate": 7.117392411510058e-05, "loss": 1.8534, "step": 1135 }, { "epoch": 0.29, "grad_norm": 0.3037747135234151, "learning_rate": 7.114845938375351e-05, "loss": 2.1786, "step": 1136 }, { "epoch": 0.29, "grad_norm": 0.2299678580355851, "learning_rate": 7.112299465240642e-05, "loss": 1.8511, "step": 1137 }, { "epoch": 0.29, "grad_norm": 0.23206102584752836, "learning_rate": 7.109752992105933e-05, "loss": 1.9227, "step": 1138 }, { "epoch": 0.29, "grad_norm": 0.2537273657151455, "learning_rate": 7.107206518971226e-05, "loss": 1.8559, "step": 1139 }, { "epoch": 0.29, "grad_norm": 0.23767428403068216, "learning_rate": 7.104660045836517e-05, "loss": 2.222, "step": 1140 }, { "epoch": 0.29, "grad_norm": 0.23592912078024372, "learning_rate": 7.102113572701808e-05, "loss": 1.8312, "step": 1141 }, { "epoch": 0.29, "grad_norm": 0.30055754876376845, "learning_rate": 7.099567099567101e-05, "loss": 1.8955, "step": 1142 }, { "epoch": 0.29, "grad_norm": 0.265817656493249, "learning_rate": 7.097020626432392e-05, "loss": 1.9888, "step": 1143 }, { "epoch": 0.29, "grad_norm": 0.25581988546790446, "learning_rate": 7.094474153297683e-05, "loss": 2.0585, "step": 1144 }, { "epoch": 0.29, "grad_norm": 0.2462609495110378, "learning_rate": 7.091927680162974e-05, "loss": 1.7901, "step": 1145 }, { "epoch": 0.29, "grad_norm": 0.2748599112846291, "learning_rate": 7.089381207028266e-05, "loss": 1.9851, "step": 1146 }, { "epoch": 0.29, "grad_norm": 0.2309607894623946, "learning_rate": 7.086834733893558e-05, "loss": 2.0558, "step": 1147 }, { "epoch": 0.29, "grad_norm": 0.24386882147790764, "learning_rate": 7.084288260758849e-05, "loss": 1.7379, "step": 1148 }, { "epoch": 0.29, "grad_norm": 0.27708527784214965, "learning_rate": 7.08174178762414e-05, "loss": 1.8403, "step": 1149 }, { "epoch": 0.29, "grad_norm": 0.3354518396089165, "learning_rate": 7.079195314489433e-05, "loss": 1.7877, "step": 1150 }, { "epoch": 0.29, "grad_norm": 0.22490205354791917, "learning_rate": 7.076648841354724e-05, "loss": 1.6953, "step": 1151 }, { "epoch": 0.29, "grad_norm": 0.25284629536951586, "learning_rate": 7.074102368220015e-05, "loss": 1.8155, "step": 1152 }, { "epoch": 0.29, "grad_norm": 0.2068117165364758, "learning_rate": 7.071555895085307e-05, "loss": 1.6563, "step": 1153 }, { "epoch": 0.29, "grad_norm": 0.22447663406807988, "learning_rate": 7.069009421950599e-05, "loss": 1.8194, "step": 1154 }, { "epoch": 0.29, "grad_norm": 0.23613657163494348, "learning_rate": 7.06646294881589e-05, "loss": 1.9006, "step": 1155 }, { "epoch": 0.29, "grad_norm": 0.24702024894989047, "learning_rate": 7.063916475681182e-05, "loss": 2.1066, "step": 1156 }, { "epoch": 0.29, "grad_norm": 0.3056091771193659, "learning_rate": 7.061370002546473e-05, "loss": 1.9301, "step": 1157 }, { "epoch": 0.29, "grad_norm": 0.2747597332958069, "learning_rate": 7.058823529411765e-05, "loss": 2.1646, "step": 1158 }, { "epoch": 0.29, "grad_norm": 0.24583240285929006, "learning_rate": 7.056277056277057e-05, "loss": 1.9277, "step": 1159 }, { "epoch": 0.3, "grad_norm": 0.331506392830848, "learning_rate": 7.053730583142348e-05, "loss": 2.0941, "step": 1160 }, { "epoch": 0.3, "grad_norm": 0.3055671318828273, "learning_rate": 7.051184110007639e-05, "loss": 1.7977, "step": 1161 }, { "epoch": 0.3, "grad_norm": 0.23832312200391165, "learning_rate": 7.048637636872932e-05, "loss": 1.8091, "step": 1162 }, { "epoch": 0.3, "grad_norm": 0.24203766525563156, "learning_rate": 7.046091163738223e-05, "loss": 1.7537, "step": 1163 }, { "epoch": 0.3, "grad_norm": 0.2523149952286734, "learning_rate": 7.043544690603514e-05, "loss": 1.7045, "step": 1164 }, { "epoch": 0.3, "grad_norm": 0.24369301007467567, "learning_rate": 7.040998217468807e-05, "loss": 1.7281, "step": 1165 }, { "epoch": 0.3, "grad_norm": 0.2259776689902922, "learning_rate": 7.038451744334098e-05, "loss": 1.5897, "step": 1166 }, { "epoch": 0.3, "grad_norm": 0.2994312483004138, "learning_rate": 7.035905271199389e-05, "loss": 1.9248, "step": 1167 }, { "epoch": 0.3, "grad_norm": 0.2526797086621746, "learning_rate": 7.03335879806468e-05, "loss": 1.7435, "step": 1168 }, { "epoch": 0.3, "grad_norm": 0.23277674758895478, "learning_rate": 7.030812324929971e-05, "loss": 1.72, "step": 1169 }, { "epoch": 0.3, "grad_norm": 0.26770775154189086, "learning_rate": 7.028265851795264e-05, "loss": 1.7953, "step": 1170 }, { "epoch": 0.3, "grad_norm": 0.2760582265145269, "learning_rate": 7.025719378660555e-05, "loss": 1.9682, "step": 1171 }, { "epoch": 0.3, "grad_norm": 0.2984211962176011, "learning_rate": 7.023172905525846e-05, "loss": 1.8924, "step": 1172 }, { "epoch": 0.3, "grad_norm": 0.2490923393519683, "learning_rate": 7.020626432391139e-05, "loss": 2.0529, "step": 1173 }, { "epoch": 0.3, "grad_norm": 0.23055860416536486, "learning_rate": 7.01807995925643e-05, "loss": 2.0508, "step": 1174 }, { "epoch": 0.3, "grad_norm": 0.24071860164918554, "learning_rate": 7.015533486121721e-05, "loss": 1.7432, "step": 1175 }, { "epoch": 0.3, "grad_norm": 0.23464291677367424, "learning_rate": 7.012987012987014e-05, "loss": 1.8712, "step": 1176 }, { "epoch": 0.3, "grad_norm": 0.25136838377673104, "learning_rate": 7.010440539852305e-05, "loss": 1.8357, "step": 1177 }, { "epoch": 0.3, "grad_norm": 0.23352658572465643, "learning_rate": 7.007894066717596e-05, "loss": 1.9611, "step": 1178 }, { "epoch": 0.3, "grad_norm": 0.2741285583327767, "learning_rate": 7.005347593582889e-05, "loss": 1.9042, "step": 1179 }, { "epoch": 0.3, "grad_norm": 0.23900251196893887, "learning_rate": 7.002801120448179e-05, "loss": 1.9515, "step": 1180 }, { "epoch": 0.3, "grad_norm": 0.2060822834500915, "learning_rate": 7.000254647313471e-05, "loss": 1.6298, "step": 1181 }, { "epoch": 0.3, "grad_norm": 0.23316114325732196, "learning_rate": 6.997708174178763e-05, "loss": 1.6614, "step": 1182 }, { "epoch": 0.3, "grad_norm": 0.26262615669137385, "learning_rate": 6.995161701044054e-05, "loss": 1.9487, "step": 1183 }, { "epoch": 0.3, "grad_norm": 0.2601703130226013, "learning_rate": 6.992615227909346e-05, "loss": 1.9944, "step": 1184 }, { "epoch": 0.3, "grad_norm": 0.2698487333136885, "learning_rate": 6.990068754774638e-05, "loss": 2.1823, "step": 1185 }, { "epoch": 0.3, "grad_norm": 0.2781873686650244, "learning_rate": 6.987522281639929e-05, "loss": 2.2884, "step": 1186 }, { "epoch": 0.3, "grad_norm": 0.2159810309276435, "learning_rate": 6.984975808505221e-05, "loss": 1.5913, "step": 1187 }, { "epoch": 0.3, "grad_norm": 0.28527333194417825, "learning_rate": 6.982429335370511e-05, "loss": 1.8822, "step": 1188 }, { "epoch": 0.3, "grad_norm": 0.23135515863468417, "learning_rate": 6.979882862235804e-05, "loss": 1.7036, "step": 1189 }, { "epoch": 0.3, "grad_norm": 0.24407147763453632, "learning_rate": 6.977336389101096e-05, "loss": 1.7656, "step": 1190 }, { "epoch": 0.3, "grad_norm": 0.27642317565367014, "learning_rate": 6.974789915966386e-05, "loss": 1.8132, "step": 1191 }, { "epoch": 0.3, "grad_norm": 0.3566106334497433, "learning_rate": 6.972243442831679e-05, "loss": 2.1099, "step": 1192 }, { "epoch": 0.3, "grad_norm": 0.3242742766665087, "learning_rate": 6.96969696969697e-05, "loss": 2.0686, "step": 1193 }, { "epoch": 0.3, "grad_norm": 0.2382209576481061, "learning_rate": 6.967150496562261e-05, "loss": 1.6727, "step": 1194 }, { "epoch": 0.3, "grad_norm": 0.24684876471560604, "learning_rate": 6.964604023427554e-05, "loss": 1.9016, "step": 1195 }, { "epoch": 0.3, "grad_norm": 0.2424328622470283, "learning_rate": 6.962057550292845e-05, "loss": 1.6665, "step": 1196 }, { "epoch": 0.3, "grad_norm": 0.21481593558855225, "learning_rate": 6.959511077158136e-05, "loss": 1.5815, "step": 1197 }, { "epoch": 0.3, "grad_norm": 0.26915990430990216, "learning_rate": 6.956964604023429e-05, "loss": 2.0001, "step": 1198 }, { "epoch": 0.31, "grad_norm": 0.24361440787524696, "learning_rate": 6.954418130888719e-05, "loss": 1.7438, "step": 1199 }, { "epoch": 0.31, "grad_norm": 0.25051539164691516, "learning_rate": 6.951871657754011e-05, "loss": 1.7314, "step": 1200 }, { "epoch": 0.31, "grad_norm": 0.2484958370261083, "learning_rate": 6.949325184619304e-05, "loss": 1.9663, "step": 1201 }, { "epoch": 0.31, "grad_norm": 0.255854613864076, "learning_rate": 6.946778711484593e-05, "loss": 1.9106, "step": 1202 }, { "epoch": 0.31, "grad_norm": 0.22296479246373885, "learning_rate": 6.944232238349886e-05, "loss": 1.753, "step": 1203 }, { "epoch": 0.31, "grad_norm": 0.2380491361269653, "learning_rate": 6.941685765215177e-05, "loss": 1.5306, "step": 1204 }, { "epoch": 0.31, "grad_norm": 0.2523746365284298, "learning_rate": 6.939139292080468e-05, "loss": 1.7611, "step": 1205 }, { "epoch": 0.31, "grad_norm": 0.2216426297441028, "learning_rate": 6.936592818945761e-05, "loss": 1.8845, "step": 1206 }, { "epoch": 0.31, "grad_norm": 0.21847890335976045, "learning_rate": 6.934046345811052e-05, "loss": 1.6605, "step": 1207 }, { "epoch": 0.31, "grad_norm": 0.26162861937449305, "learning_rate": 6.931499872676343e-05, "loss": 2.0477, "step": 1208 }, { "epoch": 0.31, "grad_norm": 0.2900063467358922, "learning_rate": 6.928953399541636e-05, "loss": 1.9684, "step": 1209 }, { "epoch": 0.31, "grad_norm": 0.22609536749703432, "learning_rate": 6.926406926406926e-05, "loss": 1.6459, "step": 1210 }, { "epoch": 0.31, "grad_norm": 0.29145205498250837, "learning_rate": 6.923860453272218e-05, "loss": 2.1166, "step": 1211 }, { "epoch": 0.31, "grad_norm": 0.2553795253317233, "learning_rate": 6.921313980137511e-05, "loss": 1.9199, "step": 1212 }, { "epoch": 0.31, "grad_norm": 0.25504394348961323, "learning_rate": 6.918767507002801e-05, "loss": 1.9561, "step": 1213 }, { "epoch": 0.31, "grad_norm": 0.29217266004279324, "learning_rate": 6.916221033868093e-05, "loss": 2.0036, "step": 1214 }, { "epoch": 0.31, "grad_norm": 0.23271868690970354, "learning_rate": 6.913674560733385e-05, "loss": 1.7754, "step": 1215 }, { "epoch": 0.31, "grad_norm": 0.2711700568417082, "learning_rate": 6.911128087598676e-05, "loss": 1.987, "step": 1216 }, { "epoch": 0.31, "grad_norm": 0.25555659717339474, "learning_rate": 6.908581614463968e-05, "loss": 1.8564, "step": 1217 }, { "epoch": 0.31, "grad_norm": 0.23641129283480852, "learning_rate": 6.90603514132926e-05, "loss": 2.0455, "step": 1218 }, { "epoch": 0.31, "grad_norm": 0.2748935081457799, "learning_rate": 6.903488668194551e-05, "loss": 2.0183, "step": 1219 }, { "epoch": 0.31, "grad_norm": 0.236873454665123, "learning_rate": 6.900942195059843e-05, "loss": 1.728, "step": 1220 }, { "epoch": 0.31, "grad_norm": 0.1917179741723266, "learning_rate": 6.898395721925133e-05, "loss": 1.5001, "step": 1221 }, { "epoch": 0.31, "grad_norm": 0.2337431576563751, "learning_rate": 6.895849248790426e-05, "loss": 1.9901, "step": 1222 }, { "epoch": 0.31, "grad_norm": 0.2622288813801724, "learning_rate": 6.893302775655717e-05, "loss": 1.8896, "step": 1223 }, { "epoch": 0.31, "grad_norm": 0.27413476869769615, "learning_rate": 6.890756302521008e-05, "loss": 1.801, "step": 1224 }, { "epoch": 0.31, "grad_norm": 0.30057635078237704, "learning_rate": 6.888209829386301e-05, "loss": 2.0947, "step": 1225 }, { "epoch": 0.31, "grad_norm": 0.23108785525032752, "learning_rate": 6.885663356251592e-05, "loss": 1.7747, "step": 1226 }, { "epoch": 0.31, "grad_norm": 0.26140156784470325, "learning_rate": 6.883116883116883e-05, "loss": 1.9075, "step": 1227 }, { "epoch": 0.31, "grad_norm": 0.25422769885968144, "learning_rate": 6.880570409982176e-05, "loss": 1.9126, "step": 1228 }, { "epoch": 0.31, "grad_norm": 0.24198688381192876, "learning_rate": 6.878023936847467e-05, "loss": 1.7064, "step": 1229 }, { "epoch": 0.31, "grad_norm": 0.22749023568504842, "learning_rate": 6.875477463712758e-05, "loss": 1.5782, "step": 1230 }, { "epoch": 0.31, "grad_norm": 0.24970716223565978, "learning_rate": 6.87293099057805e-05, "loss": 1.7543, "step": 1231 }, { "epoch": 0.31, "grad_norm": 0.24069863716059806, "learning_rate": 6.87038451744334e-05, "loss": 1.9083, "step": 1232 }, { "epoch": 0.31, "grad_norm": 0.21938296261489496, "learning_rate": 6.867838044308633e-05, "loss": 1.8046, "step": 1233 }, { "epoch": 0.31, "grad_norm": 0.29344740866119606, "learning_rate": 6.865291571173924e-05, "loss": 2.2228, "step": 1234 }, { "epoch": 0.31, "grad_norm": 0.2403872476488659, "learning_rate": 6.862745098039216e-05, "loss": 1.8867, "step": 1235 }, { "epoch": 0.31, "grad_norm": 0.20991083478840128, "learning_rate": 6.860198624904508e-05, "loss": 1.5654, "step": 1236 }, { "epoch": 0.31, "grad_norm": 0.24375700757150615, "learning_rate": 6.857652151769799e-05, "loss": 1.8848, "step": 1237 }, { "epoch": 0.32, "grad_norm": 0.21450749873567568, "learning_rate": 6.85510567863509e-05, "loss": 1.9196, "step": 1238 }, { "epoch": 0.32, "grad_norm": 0.28072297202065793, "learning_rate": 6.852559205500382e-05, "loss": 1.9887, "step": 1239 }, { "epoch": 0.32, "grad_norm": 0.2647998278270836, "learning_rate": 6.850012732365674e-05, "loss": 1.8879, "step": 1240 }, { "epoch": 0.32, "grad_norm": 0.31102485579797085, "learning_rate": 6.847466259230965e-05, "loss": 1.8028, "step": 1241 }, { "epoch": 0.32, "grad_norm": 0.2752105042682146, "learning_rate": 6.844919786096257e-05, "loss": 1.9422, "step": 1242 }, { "epoch": 0.32, "grad_norm": 0.2793974395248586, "learning_rate": 6.842373312961549e-05, "loss": 1.9473, "step": 1243 }, { "epoch": 0.32, "grad_norm": 0.291715101353079, "learning_rate": 6.83982683982684e-05, "loss": 2.0187, "step": 1244 }, { "epoch": 0.32, "grad_norm": 0.25682453666882127, "learning_rate": 6.837280366692132e-05, "loss": 2.0103, "step": 1245 }, { "epoch": 0.32, "grad_norm": 0.26515443577592535, "learning_rate": 6.834733893557423e-05, "loss": 1.7907, "step": 1246 }, { "epoch": 0.32, "grad_norm": 0.25147888736953206, "learning_rate": 6.832187420422714e-05, "loss": 1.863, "step": 1247 }, { "epoch": 0.32, "grad_norm": 0.2366004509763539, "learning_rate": 6.829640947288007e-05, "loss": 1.8877, "step": 1248 }, { "epoch": 0.32, "grad_norm": 0.22844394961748615, "learning_rate": 6.827094474153298e-05, "loss": 1.8736, "step": 1249 }, { "epoch": 0.32, "grad_norm": 0.2543467149075416, "learning_rate": 6.824548001018589e-05, "loss": 2.0736, "step": 1250 }, { "epoch": 0.32, "grad_norm": 0.25818288091804553, "learning_rate": 6.822001527883882e-05, "loss": 1.8744, "step": 1251 }, { "epoch": 0.32, "grad_norm": 0.23945688023819678, "learning_rate": 6.819455054749173e-05, "loss": 1.9383, "step": 1252 }, { "epoch": 0.32, "grad_norm": 0.2378999128861847, "learning_rate": 6.816908581614464e-05, "loss": 1.8548, "step": 1253 }, { "epoch": 0.32, "grad_norm": 0.21585408977785395, "learning_rate": 6.814362108479757e-05, "loss": 1.7908, "step": 1254 }, { "epoch": 0.32, "grad_norm": 0.2908614089243041, "learning_rate": 6.811815635345048e-05, "loss": 2.259, "step": 1255 }, { "epoch": 0.32, "grad_norm": 0.25735586359063733, "learning_rate": 6.809269162210339e-05, "loss": 2.001, "step": 1256 }, { "epoch": 0.32, "grad_norm": 0.29593359861047985, "learning_rate": 6.80672268907563e-05, "loss": 2.1095, "step": 1257 }, { "epoch": 0.32, "grad_norm": 0.2446240179064592, "learning_rate": 6.804176215940921e-05, "loss": 1.9325, "step": 1258 }, { "epoch": 0.32, "grad_norm": 0.2967514609234207, "learning_rate": 6.801629742806214e-05, "loss": 1.9994, "step": 1259 }, { "epoch": 0.32, "grad_norm": 0.25424482806186377, "learning_rate": 6.799083269671505e-05, "loss": 1.8767, "step": 1260 }, { "epoch": 0.32, "grad_norm": 0.21837488436682204, "learning_rate": 6.796536796536796e-05, "loss": 1.7035, "step": 1261 }, { "epoch": 0.32, "grad_norm": 0.2392078035990844, "learning_rate": 6.793990323402089e-05, "loss": 1.8737, "step": 1262 }, { "epoch": 0.32, "grad_norm": 0.27081493474137447, "learning_rate": 6.79144385026738e-05, "loss": 1.8451, "step": 1263 }, { "epoch": 0.32, "grad_norm": 0.2154631615481794, "learning_rate": 6.788897377132671e-05, "loss": 1.8449, "step": 1264 }, { "epoch": 0.32, "grad_norm": 0.22887887367874743, "learning_rate": 6.786350903997964e-05, "loss": 1.934, "step": 1265 }, { "epoch": 0.32, "grad_norm": 0.2634533435722663, "learning_rate": 6.783804430863254e-05, "loss": 1.7119, "step": 1266 }, { "epoch": 0.32, "grad_norm": 0.23048658625232663, "learning_rate": 6.781257957728546e-05, "loss": 1.923, "step": 1267 }, { "epoch": 0.32, "grad_norm": 0.2676248022423503, "learning_rate": 6.778711484593838e-05, "loss": 2.1183, "step": 1268 }, { "epoch": 0.32, "grad_norm": 0.24552379182876227, "learning_rate": 6.776165011459129e-05, "loss": 2.1152, "step": 1269 }, { "epoch": 0.32, "grad_norm": 0.24533812000343835, "learning_rate": 6.773618538324421e-05, "loss": 2.067, "step": 1270 }, { "epoch": 0.32, "grad_norm": 0.22815212454290484, "learning_rate": 6.771072065189713e-05, "loss": 1.6645, "step": 1271 }, { "epoch": 0.32, "grad_norm": 0.22504858942695816, "learning_rate": 6.768525592055004e-05, "loss": 1.671, "step": 1272 }, { "epoch": 0.32, "grad_norm": 0.25821809941671164, "learning_rate": 6.765979118920296e-05, "loss": 1.9414, "step": 1273 }, { "epoch": 0.32, "grad_norm": 0.2580070628430365, "learning_rate": 6.763432645785586e-05, "loss": 2.1002, "step": 1274 }, { "epoch": 0.32, "grad_norm": 0.2534860067596615, "learning_rate": 6.760886172650879e-05, "loss": 1.7564, "step": 1275 }, { "epoch": 0.32, "grad_norm": 0.29466220969681545, "learning_rate": 6.758339699516171e-05, "loss": 2.1683, "step": 1276 }, { "epoch": 0.33, "grad_norm": 0.24849163941014, "learning_rate": 6.755793226381461e-05, "loss": 1.9333, "step": 1277 }, { "epoch": 0.33, "grad_norm": 0.2715774897881854, "learning_rate": 6.753246753246754e-05, "loss": 2.1641, "step": 1278 }, { "epoch": 0.33, "grad_norm": 0.2773423127687851, "learning_rate": 6.750700280112045e-05, "loss": 2.1609, "step": 1279 }, { "epoch": 0.33, "grad_norm": 0.18887337730702833, "learning_rate": 6.748153806977336e-05, "loss": 1.4509, "step": 1280 }, { "epoch": 0.33, "grad_norm": 0.24272226698149396, "learning_rate": 6.745607333842629e-05, "loss": 2.0887, "step": 1281 }, { "epoch": 0.33, "grad_norm": 0.2505609209948652, "learning_rate": 6.74306086070792e-05, "loss": 1.7977, "step": 1282 }, { "epoch": 0.33, "grad_norm": 0.2514602068130961, "learning_rate": 6.740514387573211e-05, "loss": 2.1257, "step": 1283 }, { "epoch": 0.33, "grad_norm": 0.25730321172276827, "learning_rate": 6.737967914438504e-05, "loss": 1.7093, "step": 1284 }, { "epoch": 0.33, "grad_norm": 0.27653387859111245, "learning_rate": 6.735421441303794e-05, "loss": 1.8013, "step": 1285 }, { "epoch": 0.33, "grad_norm": 0.25129774152467227, "learning_rate": 6.732874968169086e-05, "loss": 2.0237, "step": 1286 }, { "epoch": 0.33, "grad_norm": 0.309722381690592, "learning_rate": 6.730328495034379e-05, "loss": 2.0883, "step": 1287 }, { "epoch": 0.33, "grad_norm": 0.23143122022516766, "learning_rate": 6.727782021899669e-05, "loss": 1.6351, "step": 1288 }, { "epoch": 0.33, "grad_norm": 0.20390221566195668, "learning_rate": 6.725235548764961e-05, "loss": 1.5702, "step": 1289 }, { "epoch": 0.33, "grad_norm": 0.22749916840233614, "learning_rate": 6.722689075630254e-05, "loss": 1.9343, "step": 1290 }, { "epoch": 0.33, "grad_norm": 0.33009673523214367, "learning_rate": 6.720142602495543e-05, "loss": 1.6855, "step": 1291 }, { "epoch": 0.33, "grad_norm": 0.27403768337901185, "learning_rate": 6.717596129360836e-05, "loss": 1.8071, "step": 1292 }, { "epoch": 0.33, "grad_norm": 0.2502055366326991, "learning_rate": 6.715049656226127e-05, "loss": 1.7944, "step": 1293 }, { "epoch": 0.33, "grad_norm": 0.23232143700286442, "learning_rate": 6.712503183091418e-05, "loss": 1.822, "step": 1294 }, { "epoch": 0.33, "grad_norm": 0.2513763568335079, "learning_rate": 6.709956709956711e-05, "loss": 1.8895, "step": 1295 }, { "epoch": 0.33, "grad_norm": 0.23929699946645303, "learning_rate": 6.707410236822001e-05, "loss": 1.8651, "step": 1296 }, { "epoch": 0.33, "grad_norm": 0.2972379523692023, "learning_rate": 6.704863763687293e-05, "loss": 1.9477, "step": 1297 }, { "epoch": 0.33, "grad_norm": 0.24284732826570454, "learning_rate": 6.702317290552586e-05, "loss": 1.9075, "step": 1298 }, { "epoch": 0.33, "grad_norm": 0.23334467440246193, "learning_rate": 6.699770817417876e-05, "loss": 1.7954, "step": 1299 }, { "epoch": 0.33, "grad_norm": 0.2592609350037409, "learning_rate": 6.697224344283168e-05, "loss": 1.7029, "step": 1300 }, { "epoch": 0.33, "grad_norm": 0.25435686473262703, "learning_rate": 6.69467787114846e-05, "loss": 2.005, "step": 1301 }, { "epoch": 0.33, "grad_norm": 0.27128846489090047, "learning_rate": 6.692131398013751e-05, "loss": 1.9411, "step": 1302 }, { "epoch": 0.33, "grad_norm": 0.27775912006952264, "learning_rate": 6.689584924879043e-05, "loss": 1.741, "step": 1303 }, { "epoch": 0.33, "grad_norm": 0.3025675036838691, "learning_rate": 6.687038451744335e-05, "loss": 1.8248, "step": 1304 }, { "epoch": 0.33, "grad_norm": 0.30757336985638156, "learning_rate": 6.684491978609626e-05, "loss": 2.0837, "step": 1305 }, { "epoch": 0.33, "grad_norm": 0.24105919764034295, "learning_rate": 6.681945505474918e-05, "loss": 1.8052, "step": 1306 }, { "epoch": 0.33, "grad_norm": 0.22409585216388958, "learning_rate": 6.67939903234021e-05, "loss": 1.6513, "step": 1307 }, { "epoch": 0.33, "grad_norm": 0.2285779721231332, "learning_rate": 6.676852559205501e-05, "loss": 1.7821, "step": 1308 }, { "epoch": 0.33, "grad_norm": 0.26111613115485566, "learning_rate": 6.674306086070792e-05, "loss": 1.9924, "step": 1309 }, { "epoch": 0.33, "grad_norm": 0.24233445969365314, "learning_rate": 6.671759612936083e-05, "loss": 1.7896, "step": 1310 }, { "epoch": 0.33, "grad_norm": 0.26812528092251803, "learning_rate": 6.669213139801376e-05, "loss": 1.8401, "step": 1311 }, { "epoch": 0.33, "grad_norm": 0.26731033568969936, "learning_rate": 6.666666666666667e-05, "loss": 1.8132, "step": 1312 }, { "epoch": 0.33, "grad_norm": 0.27340666733340496, "learning_rate": 6.664120193531958e-05, "loss": 1.9519, "step": 1313 }, { "epoch": 0.33, "grad_norm": 0.25663399825692723, "learning_rate": 6.661573720397251e-05, "loss": 1.8158, "step": 1314 }, { "epoch": 0.33, "grad_norm": 0.24286383359922728, "learning_rate": 6.659027247262542e-05, "loss": 1.734, "step": 1315 }, { "epoch": 0.33, "grad_norm": 0.22275185623797666, "learning_rate": 6.656480774127833e-05, "loss": 1.8296, "step": 1316 }, { "epoch": 0.34, "grad_norm": 0.22560261586284144, "learning_rate": 6.653934300993124e-05, "loss": 1.8353, "step": 1317 }, { "epoch": 0.34, "grad_norm": 0.2689726888315691, "learning_rate": 6.651387827858417e-05, "loss": 1.7082, "step": 1318 }, { "epoch": 0.34, "grad_norm": 0.2559405918656532, "learning_rate": 6.648841354723708e-05, "loss": 1.7235, "step": 1319 }, { "epoch": 0.34, "grad_norm": 0.29269610115819655, "learning_rate": 6.646294881589e-05, "loss": 1.969, "step": 1320 }, { "epoch": 0.34, "grad_norm": 0.2800083701338728, "learning_rate": 6.64374840845429e-05, "loss": 2.0688, "step": 1321 }, { "epoch": 0.34, "grad_norm": 0.2775607401940869, "learning_rate": 6.641201935319583e-05, "loss": 1.9894, "step": 1322 }, { "epoch": 0.34, "grad_norm": 0.28656169009328225, "learning_rate": 6.638655462184874e-05, "loss": 1.9262, "step": 1323 }, { "epoch": 0.34, "grad_norm": 0.22717866475058626, "learning_rate": 6.636108989050166e-05, "loss": 1.8858, "step": 1324 }, { "epoch": 0.34, "grad_norm": 0.22732265151658854, "learning_rate": 6.633562515915458e-05, "loss": 1.783, "step": 1325 }, { "epoch": 0.34, "grad_norm": 0.27420903756528914, "learning_rate": 6.631016042780749e-05, "loss": 2.0469, "step": 1326 }, { "epoch": 0.34, "grad_norm": 0.259177481462969, "learning_rate": 6.62846956964604e-05, "loss": 2.0167, "step": 1327 }, { "epoch": 0.34, "grad_norm": 0.31581149304212625, "learning_rate": 6.625923096511332e-05, "loss": 1.9306, "step": 1328 }, { "epoch": 0.34, "grad_norm": 0.2304666405920705, "learning_rate": 6.623376623376624e-05, "loss": 1.9596, "step": 1329 }, { "epoch": 0.34, "grad_norm": 0.28666622969728994, "learning_rate": 6.620830150241916e-05, "loss": 2.1321, "step": 1330 }, { "epoch": 0.34, "grad_norm": 0.22961063980423926, "learning_rate": 6.618283677107207e-05, "loss": 1.7907, "step": 1331 }, { "epoch": 0.34, "grad_norm": 0.2070538940282064, "learning_rate": 6.615737203972498e-05, "loss": 1.6409, "step": 1332 }, { "epoch": 0.34, "grad_norm": 0.24414366910943996, "learning_rate": 6.61319073083779e-05, "loss": 1.9651, "step": 1333 }, { "epoch": 0.34, "grad_norm": 0.22810236740192402, "learning_rate": 6.610644257703082e-05, "loss": 1.5453, "step": 1334 }, { "epoch": 0.34, "grad_norm": 0.2658117843706197, "learning_rate": 6.608097784568373e-05, "loss": 1.9749, "step": 1335 }, { "epoch": 0.34, "grad_norm": 0.2468581966524379, "learning_rate": 6.605551311433664e-05, "loss": 2.0176, "step": 1336 }, { "epoch": 0.34, "grad_norm": 0.2165745634927306, "learning_rate": 6.603004838298957e-05, "loss": 1.8768, "step": 1337 }, { "epoch": 0.34, "grad_norm": 0.35145263014693984, "learning_rate": 6.600458365164248e-05, "loss": 2.1447, "step": 1338 }, { "epoch": 0.34, "grad_norm": 0.2898264532763385, "learning_rate": 6.597911892029539e-05, "loss": 2.0813, "step": 1339 }, { "epoch": 0.34, "grad_norm": 0.25197292220531337, "learning_rate": 6.595365418894832e-05, "loss": 1.7399, "step": 1340 }, { "epoch": 0.34, "grad_norm": 0.24448733316268856, "learning_rate": 6.592818945760123e-05, "loss": 1.9932, "step": 1341 }, { "epoch": 0.34, "grad_norm": 0.24476118946409367, "learning_rate": 6.590272472625414e-05, "loss": 1.9617, "step": 1342 }, { "epoch": 0.34, "grad_norm": 0.2640236803446789, "learning_rate": 6.587725999490705e-05, "loss": 2.2631, "step": 1343 }, { "epoch": 0.34, "grad_norm": 0.2591299870319825, "learning_rate": 6.585179526355996e-05, "loss": 1.9671, "step": 1344 }, { "epoch": 0.34, "grad_norm": 0.2675207034682249, "learning_rate": 6.582633053221289e-05, "loss": 1.693, "step": 1345 }, { "epoch": 0.34, "grad_norm": 0.24760865376865063, "learning_rate": 6.58008658008658e-05, "loss": 1.9212, "step": 1346 }, { "epoch": 0.34, "grad_norm": 0.21617154845504996, "learning_rate": 6.577540106951871e-05, "loss": 1.7881, "step": 1347 }, { "epoch": 0.34, "grad_norm": 0.2375275072054588, "learning_rate": 6.574993633817164e-05, "loss": 1.7963, "step": 1348 }, { "epoch": 0.34, "grad_norm": 0.2585299940545612, "learning_rate": 6.572447160682455e-05, "loss": 1.8754, "step": 1349 }, { "epoch": 0.34, "grad_norm": 0.25376249792299027, "learning_rate": 6.569900687547746e-05, "loss": 1.8009, "step": 1350 }, { "epoch": 0.34, "grad_norm": 0.25025286403430946, "learning_rate": 6.567354214413039e-05, "loss": 1.9266, "step": 1351 }, { "epoch": 0.34, "grad_norm": 0.25199458054739865, "learning_rate": 6.564807741278329e-05, "loss": 1.9279, "step": 1352 }, { "epoch": 0.34, "grad_norm": 0.27266634783040256, "learning_rate": 6.562261268143621e-05, "loss": 1.9831, "step": 1353 }, { "epoch": 0.34, "grad_norm": 0.27669504351658974, "learning_rate": 6.559714795008913e-05, "loss": 1.9848, "step": 1354 }, { "epoch": 0.34, "grad_norm": 0.24953412069247122, "learning_rate": 6.557168321874204e-05, "loss": 1.8536, "step": 1355 }, { "epoch": 0.35, "grad_norm": 0.2702238885808487, "learning_rate": 6.554621848739496e-05, "loss": 2.0665, "step": 1356 }, { "epoch": 0.35, "grad_norm": 0.2321969715255644, "learning_rate": 6.552075375604788e-05, "loss": 1.9268, "step": 1357 }, { "epoch": 0.35, "grad_norm": 0.2798104799004717, "learning_rate": 6.549528902470079e-05, "loss": 1.8838, "step": 1358 }, { "epoch": 0.35, "grad_norm": 0.21752493840056586, "learning_rate": 6.546982429335371e-05, "loss": 1.6784, "step": 1359 }, { "epoch": 0.35, "grad_norm": 0.2513490132339607, "learning_rate": 6.544435956200661e-05, "loss": 1.8271, "step": 1360 }, { "epoch": 0.35, "grad_norm": 0.2365064814921405, "learning_rate": 6.541889483065954e-05, "loss": 2.0017, "step": 1361 }, { "epoch": 0.35, "grad_norm": 0.2366556782309439, "learning_rate": 6.539343009931246e-05, "loss": 1.559, "step": 1362 }, { "epoch": 0.35, "grad_norm": 0.2503095885499184, "learning_rate": 6.536796536796536e-05, "loss": 2.0942, "step": 1363 }, { "epoch": 0.35, "grad_norm": 0.23861207821690975, "learning_rate": 6.534250063661829e-05, "loss": 1.9378, "step": 1364 }, { "epoch": 0.35, "grad_norm": 0.21152157628586954, "learning_rate": 6.531703590527121e-05, "loss": 1.8343, "step": 1365 }, { "epoch": 0.35, "grad_norm": 0.23847317392502101, "learning_rate": 6.529157117392411e-05, "loss": 1.7983, "step": 1366 }, { "epoch": 0.35, "grad_norm": 0.25079873757873594, "learning_rate": 6.526610644257704e-05, "loss": 1.7489, "step": 1367 }, { "epoch": 0.35, "grad_norm": 0.2991405549227378, "learning_rate": 6.524064171122995e-05, "loss": 2.134, "step": 1368 }, { "epoch": 0.35, "grad_norm": 0.20564232373526892, "learning_rate": 6.521517697988286e-05, "loss": 1.6942, "step": 1369 }, { "epoch": 0.35, "grad_norm": 0.25851456100968884, "learning_rate": 6.518971224853579e-05, "loss": 1.7756, "step": 1370 }, { "epoch": 0.35, "grad_norm": 0.3344556149784563, "learning_rate": 6.516424751718869e-05, "loss": 1.8869, "step": 1371 }, { "epoch": 0.35, "grad_norm": 0.22107715477010328, "learning_rate": 6.513878278584161e-05, "loss": 1.7634, "step": 1372 }, { "epoch": 0.35, "grad_norm": 0.2764487533710954, "learning_rate": 6.511331805449454e-05, "loss": 1.7338, "step": 1373 }, { "epoch": 0.35, "grad_norm": 0.29681104673896164, "learning_rate": 6.508785332314744e-05, "loss": 1.8667, "step": 1374 }, { "epoch": 0.35, "grad_norm": 0.2723612722690003, "learning_rate": 6.506238859180036e-05, "loss": 2.0227, "step": 1375 }, { "epoch": 0.35, "grad_norm": 0.23207059327593188, "learning_rate": 6.503692386045329e-05, "loss": 1.8282, "step": 1376 }, { "epoch": 0.35, "grad_norm": 0.29552727261424494, "learning_rate": 6.501145912910619e-05, "loss": 1.9478, "step": 1377 }, { "epoch": 0.35, "grad_norm": 0.24430187240969053, "learning_rate": 6.498599439775911e-05, "loss": 1.6939, "step": 1378 }, { "epoch": 0.35, "grad_norm": 0.23147228743696965, "learning_rate": 6.496052966641202e-05, "loss": 2.0028, "step": 1379 }, { "epoch": 0.35, "grad_norm": 0.2138479563806518, "learning_rate": 6.493506493506494e-05, "loss": 1.8769, "step": 1380 }, { "epoch": 0.35, "grad_norm": 0.23960372675608838, "learning_rate": 6.490960020371786e-05, "loss": 1.7108, "step": 1381 }, { "epoch": 0.35, "grad_norm": 0.266942808416875, "learning_rate": 6.488413547237077e-05, "loss": 1.8276, "step": 1382 }, { "epoch": 0.35, "grad_norm": 0.23638051610892089, "learning_rate": 6.485867074102368e-05, "loss": 1.737, "step": 1383 }, { "epoch": 0.35, "grad_norm": 0.24774945818584873, "learning_rate": 6.483320600967661e-05, "loss": 1.9648, "step": 1384 }, { "epoch": 0.35, "grad_norm": 0.24892074861224348, "learning_rate": 6.480774127832951e-05, "loss": 1.7782, "step": 1385 }, { "epoch": 0.35, "grad_norm": 0.23232209443976776, "learning_rate": 6.478227654698243e-05, "loss": 1.6269, "step": 1386 }, { "epoch": 0.35, "grad_norm": 0.2334740009649208, "learning_rate": 6.475681181563535e-05, "loss": 1.8128, "step": 1387 }, { "epoch": 0.35, "grad_norm": 0.2576680645798872, "learning_rate": 6.473134708428826e-05, "loss": 2.1066, "step": 1388 }, { "epoch": 0.35, "grad_norm": 0.2638698682218262, "learning_rate": 6.470588235294118e-05, "loss": 1.7516, "step": 1389 }, { "epoch": 0.35, "grad_norm": 0.27446292971529473, "learning_rate": 6.46804176215941e-05, "loss": 1.9318, "step": 1390 }, { "epoch": 0.35, "grad_norm": 0.24366698287989227, "learning_rate": 6.465495289024701e-05, "loss": 2.0153, "step": 1391 }, { "epoch": 0.35, "grad_norm": 0.25689162894301576, "learning_rate": 6.462948815889993e-05, "loss": 1.9566, "step": 1392 }, { "epoch": 0.35, "grad_norm": 0.2711576100748685, "learning_rate": 6.460402342755285e-05, "loss": 1.6647, "step": 1393 }, { "epoch": 0.35, "grad_norm": 0.2581664314686578, "learning_rate": 6.457855869620576e-05, "loss": 1.8281, "step": 1394 }, { "epoch": 0.36, "grad_norm": 0.23620730062151213, "learning_rate": 6.455309396485867e-05, "loss": 1.7709, "step": 1395 }, { "epoch": 0.36, "grad_norm": 0.27636056898839073, "learning_rate": 6.452762923351158e-05, "loss": 1.8426, "step": 1396 }, { "epoch": 0.36, "grad_norm": 0.21443328661435834, "learning_rate": 6.450216450216451e-05, "loss": 1.6392, "step": 1397 }, { "epoch": 0.36, "grad_norm": 0.2087630945370244, "learning_rate": 6.447669977081742e-05, "loss": 1.6107, "step": 1398 }, { "epoch": 0.36, "grad_norm": 0.2534485726812814, "learning_rate": 6.445123503947033e-05, "loss": 1.7775, "step": 1399 }, { "epoch": 0.36, "grad_norm": 0.3154169558385601, "learning_rate": 6.442577030812326e-05, "loss": 2.1625, "step": 1400 }, { "epoch": 0.36, "grad_norm": 0.27588868250446313, "learning_rate": 6.440030557677617e-05, "loss": 1.7711, "step": 1401 }, { "epoch": 0.36, "grad_norm": 0.2575743471145686, "learning_rate": 6.437484084542908e-05, "loss": 1.7165, "step": 1402 }, { "epoch": 0.36, "grad_norm": 0.32638933162569, "learning_rate": 6.434937611408201e-05, "loss": 2.0342, "step": 1403 }, { "epoch": 0.36, "grad_norm": 0.215914382111131, "learning_rate": 6.432391138273492e-05, "loss": 1.6111, "step": 1404 }, { "epoch": 0.36, "grad_norm": 0.2793878260490901, "learning_rate": 6.429844665138783e-05, "loss": 1.8474, "step": 1405 }, { "epoch": 0.36, "grad_norm": 0.2793958928576041, "learning_rate": 6.427298192004074e-05, "loss": 1.9646, "step": 1406 }, { "epoch": 0.36, "grad_norm": 0.30662599759536363, "learning_rate": 6.424751718869366e-05, "loss": 1.9238, "step": 1407 }, { "epoch": 0.36, "grad_norm": 0.2886978246152014, "learning_rate": 6.422205245734658e-05, "loss": 2.0002, "step": 1408 }, { "epoch": 0.36, "grad_norm": 0.2338112886678382, "learning_rate": 6.41965877259995e-05, "loss": 1.8845, "step": 1409 }, { "epoch": 0.36, "grad_norm": 0.25610246830845196, "learning_rate": 6.41711229946524e-05, "loss": 2.0396, "step": 1410 }, { "epoch": 0.36, "grad_norm": 0.2975151413382255, "learning_rate": 6.414565826330533e-05, "loss": 1.8425, "step": 1411 }, { "epoch": 0.36, "grad_norm": 0.24863671586657765, "learning_rate": 6.412019353195824e-05, "loss": 1.8178, "step": 1412 }, { "epoch": 0.36, "grad_norm": 0.25070527551001476, "learning_rate": 6.409472880061116e-05, "loss": 1.8697, "step": 1413 }, { "epoch": 0.36, "grad_norm": 0.3037780491315251, "learning_rate": 6.406926406926407e-05, "loss": 2.1406, "step": 1414 }, { "epoch": 0.36, "grad_norm": 0.21667416854550103, "learning_rate": 6.404379933791699e-05, "loss": 1.9225, "step": 1415 }, { "epoch": 0.36, "grad_norm": 0.23434381277488175, "learning_rate": 6.40183346065699e-05, "loss": 2.0665, "step": 1416 }, { "epoch": 0.36, "grad_norm": 0.27467134571420476, "learning_rate": 6.399286987522282e-05, "loss": 1.9336, "step": 1417 }, { "epoch": 0.36, "grad_norm": 0.2859665709971017, "learning_rate": 6.396740514387573e-05, "loss": 2.042, "step": 1418 }, { "epoch": 0.36, "grad_norm": 0.2540574137329875, "learning_rate": 6.394194041252866e-05, "loss": 1.8025, "step": 1419 }, { "epoch": 0.36, "grad_norm": 0.26581656336330206, "learning_rate": 6.391647568118157e-05, "loss": 1.6173, "step": 1420 }, { "epoch": 0.36, "grad_norm": 0.2626761248646801, "learning_rate": 6.389101094983448e-05, "loss": 2.1385, "step": 1421 }, { "epoch": 0.36, "grad_norm": 0.23861266367853373, "learning_rate": 6.386554621848739e-05, "loss": 1.7302, "step": 1422 }, { "epoch": 0.36, "grad_norm": 0.2662721632074192, "learning_rate": 6.384008148714032e-05, "loss": 2.0314, "step": 1423 }, { "epoch": 0.36, "grad_norm": 0.24905956978551994, "learning_rate": 6.381461675579323e-05, "loss": 1.8828, "step": 1424 }, { "epoch": 0.36, "grad_norm": 0.24006534460468518, "learning_rate": 6.378915202444614e-05, "loss": 1.6141, "step": 1425 }, { "epoch": 0.36, "grad_norm": 0.25605440350712466, "learning_rate": 6.376368729309907e-05, "loss": 1.8383, "step": 1426 }, { "epoch": 0.36, "grad_norm": 0.2328022400944262, "learning_rate": 6.373822256175198e-05, "loss": 1.7449, "step": 1427 }, { "epoch": 0.36, "grad_norm": 0.27644689374328363, "learning_rate": 6.371275783040489e-05, "loss": 2.0059, "step": 1428 }, { "epoch": 0.36, "grad_norm": 0.25915403720670466, "learning_rate": 6.368729309905782e-05, "loss": 1.8923, "step": 1429 }, { "epoch": 0.36, "grad_norm": 0.2270547517818991, "learning_rate": 6.366182836771072e-05, "loss": 1.7948, "step": 1430 }, { "epoch": 0.36, "grad_norm": 0.2355183054234252, "learning_rate": 6.363636363636364e-05, "loss": 1.9153, "step": 1431 }, { "epoch": 0.36, "grad_norm": 0.22841443160614022, "learning_rate": 6.361089890501655e-05, "loss": 1.7276, "step": 1432 }, { "epoch": 0.36, "grad_norm": 0.23286847875282332, "learning_rate": 6.358543417366946e-05, "loss": 1.9285, "step": 1433 }, { "epoch": 0.36, "grad_norm": 0.24938555549328892, "learning_rate": 6.355996944232239e-05, "loss": 1.7281, "step": 1434 }, { "epoch": 0.37, "grad_norm": 0.2657884767907746, "learning_rate": 6.35345047109753e-05, "loss": 1.9137, "step": 1435 }, { "epoch": 0.37, "grad_norm": 0.2920189657020723, "learning_rate": 6.350903997962821e-05, "loss": 2.0625, "step": 1436 }, { "epoch": 0.37, "grad_norm": 0.2713019904179645, "learning_rate": 6.348357524828114e-05, "loss": 1.9509, "step": 1437 }, { "epoch": 0.37, "grad_norm": 0.2445235491871537, "learning_rate": 6.345811051693404e-05, "loss": 1.8115, "step": 1438 }, { "epoch": 0.37, "grad_norm": 0.2702311812446719, "learning_rate": 6.343264578558696e-05, "loss": 1.9981, "step": 1439 }, { "epoch": 0.37, "grad_norm": 0.22968169159584015, "learning_rate": 6.340718105423989e-05, "loss": 1.8995, "step": 1440 }, { "epoch": 0.37, "grad_norm": 0.22471574669439587, "learning_rate": 6.338171632289279e-05, "loss": 1.7872, "step": 1441 }, { "epoch": 0.37, "grad_norm": 0.23636276211974966, "learning_rate": 6.335625159154571e-05, "loss": 1.6673, "step": 1442 }, { "epoch": 0.37, "grad_norm": 0.24773997747702176, "learning_rate": 6.333078686019863e-05, "loss": 1.8048, "step": 1443 }, { "epoch": 0.37, "grad_norm": 0.20969578168066197, "learning_rate": 6.330532212885154e-05, "loss": 1.5605, "step": 1444 }, { "epoch": 0.37, "grad_norm": 0.26764932784946216, "learning_rate": 6.327985739750446e-05, "loss": 1.7746, "step": 1445 }, { "epoch": 0.37, "grad_norm": 0.23415979996101344, "learning_rate": 6.325439266615738e-05, "loss": 1.9106, "step": 1446 }, { "epoch": 0.37, "grad_norm": 0.2819685499073677, "learning_rate": 6.322892793481029e-05, "loss": 1.9338, "step": 1447 }, { "epoch": 0.37, "grad_norm": 0.3109154703013635, "learning_rate": 6.320346320346321e-05, "loss": 2.104, "step": 1448 }, { "epoch": 0.37, "grad_norm": 0.25161196654742346, "learning_rate": 6.317799847211611e-05, "loss": 1.9693, "step": 1449 }, { "epoch": 0.37, "grad_norm": 0.29015773053836613, "learning_rate": 6.315253374076904e-05, "loss": 1.7219, "step": 1450 }, { "epoch": 0.37, "grad_norm": 0.2714891776339336, "learning_rate": 6.312706900942196e-05, "loss": 2.1159, "step": 1451 }, { "epoch": 0.37, "grad_norm": 0.2761804603634261, "learning_rate": 6.310160427807486e-05, "loss": 1.9664, "step": 1452 }, { "epoch": 0.37, "grad_norm": 0.2757466497139924, "learning_rate": 6.307613954672779e-05, "loss": 1.921, "step": 1453 }, { "epoch": 0.37, "grad_norm": 0.2914477602563562, "learning_rate": 6.30506748153807e-05, "loss": 1.8062, "step": 1454 }, { "epoch": 0.37, "grad_norm": 0.2544853795063896, "learning_rate": 6.302521008403361e-05, "loss": 1.8595, "step": 1455 }, { "epoch": 0.37, "grad_norm": 0.22936712875165363, "learning_rate": 6.299974535268654e-05, "loss": 1.5139, "step": 1456 }, { "epoch": 0.37, "grad_norm": 0.24532316300124118, "learning_rate": 6.297428062133945e-05, "loss": 1.9526, "step": 1457 }, { "epoch": 0.37, "grad_norm": 0.3077624749909794, "learning_rate": 6.294881588999236e-05, "loss": 2.0753, "step": 1458 }, { "epoch": 0.37, "grad_norm": 0.26036970986766894, "learning_rate": 6.292335115864529e-05, "loss": 2.0096, "step": 1459 }, { "epoch": 0.37, "grad_norm": 0.2603575051181066, "learning_rate": 6.289788642729819e-05, "loss": 1.7578, "step": 1460 }, { "epoch": 0.37, "grad_norm": 0.2399714977754378, "learning_rate": 6.287242169595111e-05, "loss": 1.878, "step": 1461 }, { "epoch": 0.37, "grad_norm": 0.2849326236664881, "learning_rate": 6.284695696460404e-05, "loss": 1.814, "step": 1462 }, { "epoch": 0.37, "grad_norm": 0.23877184033983795, "learning_rate": 6.282149223325694e-05, "loss": 1.9115, "step": 1463 }, { "epoch": 0.37, "grad_norm": 0.20668147788374516, "learning_rate": 6.279602750190986e-05, "loss": 1.6837, "step": 1464 }, { "epoch": 0.37, "grad_norm": 0.25040498356110313, "learning_rate": 6.277056277056277e-05, "loss": 1.9307, "step": 1465 }, { "epoch": 0.37, "grad_norm": 0.2542878584427889, "learning_rate": 6.274509803921569e-05, "loss": 2.0858, "step": 1466 }, { "epoch": 0.37, "grad_norm": 0.26215346373467596, "learning_rate": 6.271963330786861e-05, "loss": 2.105, "step": 1467 }, { "epoch": 0.37, "grad_norm": 0.21783654192462465, "learning_rate": 6.269416857652152e-05, "loss": 1.6093, "step": 1468 }, { "epoch": 0.37, "grad_norm": 0.3033084471593143, "learning_rate": 6.266870384517444e-05, "loss": 2.3007, "step": 1469 }, { "epoch": 0.37, "grad_norm": 0.2241801431361778, "learning_rate": 6.264323911382736e-05, "loss": 1.8179, "step": 1470 }, { "epoch": 0.37, "grad_norm": 0.20758916247638143, "learning_rate": 6.261777438248026e-05, "loss": 1.7694, "step": 1471 }, { "epoch": 0.37, "grad_norm": 0.24665891751691102, "learning_rate": 6.259230965113318e-05, "loss": 1.5666, "step": 1472 }, { "epoch": 0.37, "grad_norm": 0.20837654420638607, "learning_rate": 6.25668449197861e-05, "loss": 1.7046, "step": 1473 }, { "epoch": 0.38, "grad_norm": 0.22807501023098187, "learning_rate": 6.254138018843901e-05, "loss": 1.8228, "step": 1474 }, { "epoch": 0.38, "grad_norm": 0.2185941720158155, "learning_rate": 6.251591545709193e-05, "loss": 1.7609, "step": 1475 }, { "epoch": 0.38, "grad_norm": 0.35387071932188074, "learning_rate": 6.249045072574485e-05, "loss": 2.0526, "step": 1476 }, { "epoch": 0.38, "grad_norm": 0.2554499429668722, "learning_rate": 6.246498599439776e-05, "loss": 1.5325, "step": 1477 }, { "epoch": 0.38, "grad_norm": 0.23743890961682723, "learning_rate": 6.243952126305068e-05, "loss": 1.7867, "step": 1478 }, { "epoch": 0.38, "grad_norm": 0.24947721985606203, "learning_rate": 6.24140565317036e-05, "loss": 1.7074, "step": 1479 }, { "epoch": 0.38, "grad_norm": 0.20516978440742267, "learning_rate": 6.238859180035651e-05, "loss": 1.5472, "step": 1480 }, { "epoch": 0.38, "grad_norm": 0.21535132012508432, "learning_rate": 6.236312706900943e-05, "loss": 1.7486, "step": 1481 }, { "epoch": 0.38, "grad_norm": 0.23653666697894052, "learning_rate": 6.233766233766233e-05, "loss": 1.9899, "step": 1482 }, { "epoch": 0.38, "grad_norm": 0.2437997803840399, "learning_rate": 6.231219760631526e-05, "loss": 1.8934, "step": 1483 }, { "epoch": 0.38, "grad_norm": 0.22951134946649115, "learning_rate": 6.228673287496817e-05, "loss": 1.7781, "step": 1484 }, { "epoch": 0.38, "grad_norm": 0.27430318309600316, "learning_rate": 6.226126814362108e-05, "loss": 1.893, "step": 1485 }, { "epoch": 0.38, "grad_norm": 0.2974475935249174, "learning_rate": 6.223580341227401e-05, "loss": 1.8601, "step": 1486 }, { "epoch": 0.38, "grad_norm": 0.25028619180316575, "learning_rate": 6.221033868092692e-05, "loss": 1.7759, "step": 1487 }, { "epoch": 0.38, "grad_norm": 0.2989686373796846, "learning_rate": 6.218487394957983e-05, "loss": 2.0212, "step": 1488 }, { "epoch": 0.38, "grad_norm": 0.2613443572016678, "learning_rate": 6.215940921823276e-05, "loss": 1.6686, "step": 1489 }, { "epoch": 0.38, "grad_norm": 0.2409582010917778, "learning_rate": 6.213394448688567e-05, "loss": 1.6892, "step": 1490 }, { "epoch": 0.38, "grad_norm": 0.3053278307740795, "learning_rate": 6.210847975553858e-05, "loss": 1.8391, "step": 1491 }, { "epoch": 0.38, "grad_norm": 0.27683640133493354, "learning_rate": 6.20830150241915e-05, "loss": 1.9558, "step": 1492 }, { "epoch": 0.38, "grad_norm": 0.26100866580786997, "learning_rate": 6.205755029284442e-05, "loss": 2.0114, "step": 1493 }, { "epoch": 0.38, "grad_norm": 0.25144925856208833, "learning_rate": 6.203208556149733e-05, "loss": 1.7074, "step": 1494 }, { "epoch": 0.38, "grad_norm": 0.2786330356808251, "learning_rate": 6.200662083015024e-05, "loss": 2.0112, "step": 1495 }, { "epoch": 0.38, "grad_norm": 0.25566249290316717, "learning_rate": 6.198115609880316e-05, "loss": 1.699, "step": 1496 }, { "epoch": 0.38, "grad_norm": 0.23609661553724798, "learning_rate": 6.195569136745608e-05, "loss": 1.9024, "step": 1497 }, { "epoch": 0.38, "grad_norm": 0.315676495156524, "learning_rate": 6.1930226636109e-05, "loss": 2.0619, "step": 1498 }, { "epoch": 0.38, "grad_norm": 0.26074795948740676, "learning_rate": 6.19047619047619e-05, "loss": 1.9099, "step": 1499 }, { "epoch": 0.38, "grad_norm": 0.2535138105142371, "learning_rate": 6.187929717341482e-05, "loss": 1.7467, "step": 1500 }, { "epoch": 0.38, "grad_norm": 0.22753225979438801, "learning_rate": 6.185383244206774e-05, "loss": 1.6906, "step": 1501 }, { "epoch": 0.38, "grad_norm": 0.2415540394035451, "learning_rate": 6.182836771072066e-05, "loss": 2.1264, "step": 1502 }, { "epoch": 0.38, "grad_norm": 0.24931738465143685, "learning_rate": 6.180290297937357e-05, "loss": 1.913, "step": 1503 }, { "epoch": 0.38, "grad_norm": 0.262043395741899, "learning_rate": 6.17774382480265e-05, "loss": 1.9871, "step": 1504 }, { "epoch": 0.38, "grad_norm": 0.24457982802573416, "learning_rate": 6.17519735166794e-05, "loss": 2.0219, "step": 1505 }, { "epoch": 0.38, "grad_norm": 0.20040165571522914, "learning_rate": 6.172650878533232e-05, "loss": 1.7238, "step": 1506 }, { "epoch": 0.38, "grad_norm": 0.2316379067710165, "learning_rate": 6.170104405398523e-05, "loss": 1.6758, "step": 1507 }, { "epoch": 0.38, "grad_norm": 0.25894208018998377, "learning_rate": 6.167557932263814e-05, "loss": 1.6759, "step": 1508 }, { "epoch": 0.38, "grad_norm": 0.25088205302140765, "learning_rate": 6.165011459129107e-05, "loss": 1.9603, "step": 1509 }, { "epoch": 0.38, "grad_norm": 0.25628582890009516, "learning_rate": 6.162464985994398e-05, "loss": 2.0707, "step": 1510 }, { "epoch": 0.38, "grad_norm": 0.26404782090778656, "learning_rate": 6.159918512859689e-05, "loss": 1.956, "step": 1511 }, { "epoch": 0.38, "grad_norm": 0.23154477907500365, "learning_rate": 6.157372039724982e-05, "loss": 1.7662, "step": 1512 }, { "epoch": 0.39, "grad_norm": 0.24666147746258535, "learning_rate": 6.154825566590273e-05, "loss": 1.7528, "step": 1513 }, { "epoch": 0.39, "grad_norm": 0.25819520288915415, "learning_rate": 6.152279093455564e-05, "loss": 1.8696, "step": 1514 }, { "epoch": 0.39, "grad_norm": 0.23331428351437192, "learning_rate": 6.149732620320857e-05, "loss": 1.8525, "step": 1515 }, { "epoch": 0.39, "grad_norm": 0.2463173153529251, "learning_rate": 6.147186147186147e-05, "loss": 1.8842, "step": 1516 }, { "epoch": 0.39, "grad_norm": 0.22932427939527678, "learning_rate": 6.144639674051439e-05, "loss": 1.6951, "step": 1517 }, { "epoch": 0.39, "grad_norm": 0.25462386617656213, "learning_rate": 6.14209320091673e-05, "loss": 1.9397, "step": 1518 }, { "epoch": 0.39, "grad_norm": 0.2704000017182009, "learning_rate": 6.139546727782022e-05, "loss": 2.014, "step": 1519 }, { "epoch": 0.39, "grad_norm": 0.22640643828642856, "learning_rate": 6.137000254647314e-05, "loss": 1.7801, "step": 1520 }, { "epoch": 0.39, "grad_norm": 0.2801141585363283, "learning_rate": 6.134453781512605e-05, "loss": 2.0162, "step": 1521 }, { "epoch": 0.39, "grad_norm": 0.37284208338634134, "learning_rate": 6.131907308377896e-05, "loss": 2.4535, "step": 1522 }, { "epoch": 0.39, "grad_norm": 0.2206019954737265, "learning_rate": 6.129360835243189e-05, "loss": 1.758, "step": 1523 }, { "epoch": 0.39, "grad_norm": 0.24222795087111626, "learning_rate": 6.12681436210848e-05, "loss": 1.9262, "step": 1524 }, { "epoch": 0.39, "grad_norm": 0.2623245998615822, "learning_rate": 6.124267888973771e-05, "loss": 2.1177, "step": 1525 }, { "epoch": 0.39, "grad_norm": 0.2149816677267667, "learning_rate": 6.121721415839064e-05, "loss": 1.8284, "step": 1526 }, { "epoch": 0.39, "grad_norm": 0.25969795550289637, "learning_rate": 6.119174942704354e-05, "loss": 2.0281, "step": 1527 }, { "epoch": 0.39, "grad_norm": 0.22986253115787428, "learning_rate": 6.116628469569646e-05, "loss": 1.7601, "step": 1528 }, { "epoch": 0.39, "grad_norm": 0.26093874891062196, "learning_rate": 6.114081996434938e-05, "loss": 1.9096, "step": 1529 }, { "epoch": 0.39, "grad_norm": 0.2656012692790266, "learning_rate": 6.111535523300229e-05, "loss": 1.9157, "step": 1530 }, { "epoch": 0.39, "grad_norm": 0.22004615429597213, "learning_rate": 6.108989050165521e-05, "loss": 1.7553, "step": 1531 }, { "epoch": 0.39, "grad_norm": 0.2721988415462892, "learning_rate": 6.106442577030813e-05, "loss": 1.8849, "step": 1532 }, { "epoch": 0.39, "grad_norm": 0.25852281801944327, "learning_rate": 6.103896103896104e-05, "loss": 1.8063, "step": 1533 }, { "epoch": 0.39, "grad_norm": 0.24761462659063677, "learning_rate": 6.101349630761396e-05, "loss": 1.8574, "step": 1534 }, { "epoch": 0.39, "grad_norm": 0.2726207797525334, "learning_rate": 6.098803157626687e-05, "loss": 1.7744, "step": 1535 }, { "epoch": 0.39, "grad_norm": 0.31156736919152733, "learning_rate": 6.096256684491979e-05, "loss": 1.692, "step": 1536 }, { "epoch": 0.39, "grad_norm": 0.29589856339757276, "learning_rate": 6.093710211357271e-05, "loss": 2.0424, "step": 1537 }, { "epoch": 0.39, "grad_norm": 0.25697746293815316, "learning_rate": 6.091163738222562e-05, "loss": 2.0985, "step": 1538 }, { "epoch": 0.39, "grad_norm": 0.2519730996190904, "learning_rate": 6.088617265087854e-05, "loss": 1.9054, "step": 1539 }, { "epoch": 0.39, "grad_norm": 0.2178550105568459, "learning_rate": 6.086070791953146e-05, "loss": 1.6298, "step": 1540 }, { "epoch": 0.39, "grad_norm": 0.24681784052612069, "learning_rate": 6.083524318818436e-05, "loss": 1.7629, "step": 1541 }, { "epoch": 0.39, "grad_norm": 0.23825988517410313, "learning_rate": 6.080977845683729e-05, "loss": 1.8602, "step": 1542 }, { "epoch": 0.39, "grad_norm": 0.2615875940351599, "learning_rate": 6.078431372549019e-05, "loss": 1.8776, "step": 1543 }, { "epoch": 0.39, "grad_norm": 0.24157319299390567, "learning_rate": 6.075884899414311e-05, "loss": 1.8031, "step": 1544 }, { "epoch": 0.39, "grad_norm": 0.2521081048574686, "learning_rate": 6.073338426279603e-05, "loss": 1.8563, "step": 1545 }, { "epoch": 0.39, "grad_norm": 0.2536205004287772, "learning_rate": 6.070791953144894e-05, "loss": 1.9038, "step": 1546 }, { "epoch": 0.39, "grad_norm": 0.28644482262857046, "learning_rate": 6.068245480010186e-05, "loss": 1.9221, "step": 1547 }, { "epoch": 0.39, "grad_norm": 0.2645469273778994, "learning_rate": 6.065699006875478e-05, "loss": 1.8193, "step": 1548 }, { "epoch": 0.39, "grad_norm": 0.25973897811628416, "learning_rate": 6.063152533740769e-05, "loss": 1.8508, "step": 1549 }, { "epoch": 0.39, "grad_norm": 0.24847994567745993, "learning_rate": 6.060606060606061e-05, "loss": 1.9156, "step": 1550 }, { "epoch": 0.39, "grad_norm": 0.2641550370317926, "learning_rate": 6.058059587471352e-05, "loss": 1.8653, "step": 1551 }, { "epoch": 0.4, "grad_norm": 0.24168907683701302, "learning_rate": 6.055513114336644e-05, "loss": 1.9855, "step": 1552 }, { "epoch": 0.4, "grad_norm": 0.22317099049165945, "learning_rate": 6.052966641201936e-05, "loss": 1.7703, "step": 1553 }, { "epoch": 0.4, "grad_norm": 0.2616989498334913, "learning_rate": 6.0504201680672267e-05, "loss": 1.8338, "step": 1554 }, { "epoch": 0.4, "grad_norm": 0.26024510295357467, "learning_rate": 6.0478736949325185e-05, "loss": 1.8048, "step": 1555 }, { "epoch": 0.4, "grad_norm": 0.19852627285411298, "learning_rate": 6.0453272217978104e-05, "loss": 1.8041, "step": 1556 }, { "epoch": 0.4, "grad_norm": 0.2628471899781459, "learning_rate": 6.0427807486631016e-05, "loss": 1.9684, "step": 1557 }, { "epoch": 0.4, "grad_norm": 0.22506245236447928, "learning_rate": 6.0402342755283935e-05, "loss": 1.6453, "step": 1558 }, { "epoch": 0.4, "grad_norm": 0.2571811604095075, "learning_rate": 6.0376878023936854e-05, "loss": 1.9955, "step": 1559 }, { "epoch": 0.4, "grad_norm": 0.30268943206907745, "learning_rate": 6.0351413292589766e-05, "loss": 2.0422, "step": 1560 }, { "epoch": 0.4, "grad_norm": 0.3091237791772314, "learning_rate": 6.0325948561242685e-05, "loss": 2.1414, "step": 1561 }, { "epoch": 0.4, "grad_norm": 0.23776443905612016, "learning_rate": 6.030048382989559e-05, "loss": 1.8843, "step": 1562 }, { "epoch": 0.4, "grad_norm": 0.27552521575346095, "learning_rate": 6.0275019098548516e-05, "loss": 1.6124, "step": 1563 }, { "epoch": 0.4, "grad_norm": 0.3014428924388266, "learning_rate": 6.0249554367201435e-05, "loss": 2.197, "step": 1564 }, { "epoch": 0.4, "grad_norm": 0.2699297433194896, "learning_rate": 6.022408963585434e-05, "loss": 1.7632, "step": 1565 }, { "epoch": 0.4, "grad_norm": 0.3797500496036178, "learning_rate": 6.019862490450726e-05, "loss": 2.0411, "step": 1566 }, { "epoch": 0.4, "grad_norm": 0.2750966010786085, "learning_rate": 6.0173160173160184e-05, "loss": 2.1269, "step": 1567 }, { "epoch": 0.4, "grad_norm": 0.23276078334622122, "learning_rate": 6.014769544181309e-05, "loss": 1.7726, "step": 1568 }, { "epoch": 0.4, "grad_norm": 0.2908467180242655, "learning_rate": 6.012223071046601e-05, "loss": 2.187, "step": 1569 }, { "epoch": 0.4, "grad_norm": 0.26803748525452153, "learning_rate": 6.009676597911892e-05, "loss": 1.9399, "step": 1570 }, { "epoch": 0.4, "grad_norm": 0.2333609499014553, "learning_rate": 6.007130124777184e-05, "loss": 1.9887, "step": 1571 }, { "epoch": 0.4, "grad_norm": 0.3065819896837221, "learning_rate": 6.004583651642476e-05, "loss": 1.9615, "step": 1572 }, { "epoch": 0.4, "grad_norm": 0.3629978469109588, "learning_rate": 6.0020371785077664e-05, "loss": 1.8352, "step": 1573 }, { "epoch": 0.4, "grad_norm": 0.27024806803772333, "learning_rate": 5.999490705373059e-05, "loss": 1.7978, "step": 1574 }, { "epoch": 0.4, "grad_norm": 0.3267840733438547, "learning_rate": 5.996944232238351e-05, "loss": 2.4383, "step": 1575 }, { "epoch": 0.4, "grad_norm": 0.24161056279754708, "learning_rate": 5.9943977591036414e-05, "loss": 1.9505, "step": 1576 }, { "epoch": 0.4, "grad_norm": 0.2705996663831965, "learning_rate": 5.991851285968933e-05, "loss": 2.0719, "step": 1577 }, { "epoch": 0.4, "grad_norm": 0.22459827893256726, "learning_rate": 5.9893048128342244e-05, "loss": 1.8081, "step": 1578 }, { "epoch": 0.4, "grad_norm": 0.27083593911360143, "learning_rate": 5.986758339699516e-05, "loss": 1.8431, "step": 1579 }, { "epoch": 0.4, "grad_norm": 0.27101741397604806, "learning_rate": 5.984211866564808e-05, "loss": 1.8037, "step": 1580 }, { "epoch": 0.4, "grad_norm": 0.2545983784616469, "learning_rate": 5.9816653934300994e-05, "loss": 1.9613, "step": 1581 }, { "epoch": 0.4, "grad_norm": 0.2262129059569068, "learning_rate": 5.979118920295391e-05, "loss": 1.8274, "step": 1582 }, { "epoch": 0.4, "grad_norm": 0.2673398230624889, "learning_rate": 5.976572447160683e-05, "loss": 1.9489, "step": 1583 }, { "epoch": 0.4, "grad_norm": 0.20774983809392522, "learning_rate": 5.9740259740259744e-05, "loss": 1.4933, "step": 1584 }, { "epoch": 0.4, "grad_norm": 0.24907516241492358, "learning_rate": 5.971479500891266e-05, "loss": 1.6926, "step": 1585 }, { "epoch": 0.4, "grad_norm": 0.24478498493706932, "learning_rate": 5.968933027756557e-05, "loss": 1.8847, "step": 1586 }, { "epoch": 0.4, "grad_norm": 0.28013622101707153, "learning_rate": 5.966386554621849e-05, "loss": 1.8548, "step": 1587 }, { "epoch": 0.4, "grad_norm": 0.2725771310995344, "learning_rate": 5.9638400814871406e-05, "loss": 2.1802, "step": 1588 }, { "epoch": 0.4, "grad_norm": 0.23403779249588136, "learning_rate": 5.961293608352432e-05, "loss": 1.9072, "step": 1589 }, { "epoch": 0.4, "grad_norm": 0.2146086880756573, "learning_rate": 5.958747135217724e-05, "loss": 1.7345, "step": 1590 }, { "epoch": 0.4, "grad_norm": 0.25779612807106644, "learning_rate": 5.9562006620830156e-05, "loss": 1.8754, "step": 1591 }, { "epoch": 0.41, "grad_norm": 0.24281365452004675, "learning_rate": 5.953654188948307e-05, "loss": 1.8136, "step": 1592 }, { "epoch": 0.41, "grad_norm": 0.23911587923286357, "learning_rate": 5.9511077158135987e-05, "loss": 1.9183, "step": 1593 }, { "epoch": 0.41, "grad_norm": 0.21796275578395394, "learning_rate": 5.948561242678889e-05, "loss": 1.8331, "step": 1594 }, { "epoch": 0.41, "grad_norm": 0.258296021848505, "learning_rate": 5.946014769544182e-05, "loss": 1.7855, "step": 1595 }, { "epoch": 0.41, "grad_norm": 0.23530867094746857, "learning_rate": 5.9434682964094736e-05, "loss": 1.6778, "step": 1596 }, { "epoch": 0.41, "grad_norm": 0.2208216863535817, "learning_rate": 5.940921823274764e-05, "loss": 1.7123, "step": 1597 }, { "epoch": 0.41, "grad_norm": 0.23626065670430846, "learning_rate": 5.938375350140056e-05, "loss": 1.7934, "step": 1598 }, { "epoch": 0.41, "grad_norm": 0.2570483848402682, "learning_rate": 5.9358288770053486e-05, "loss": 1.6343, "step": 1599 }, { "epoch": 0.41, "grad_norm": 0.24978742651698496, "learning_rate": 5.933282403870639e-05, "loss": 1.8905, "step": 1600 }, { "epoch": 0.41, "grad_norm": 0.25463877428295967, "learning_rate": 5.930735930735931e-05, "loss": 2.0177, "step": 1601 }, { "epoch": 0.41, "grad_norm": 0.2477609758943463, "learning_rate": 5.928189457601223e-05, "loss": 1.6774, "step": 1602 }, { "epoch": 0.41, "grad_norm": 0.26060204901142225, "learning_rate": 5.925642984466514e-05, "loss": 1.967, "step": 1603 }, { "epoch": 0.41, "grad_norm": 0.3095171548829742, "learning_rate": 5.923096511331806e-05, "loss": 1.8638, "step": 1604 }, { "epoch": 0.41, "grad_norm": 0.2518297825021641, "learning_rate": 5.9205500381970965e-05, "loss": 1.819, "step": 1605 }, { "epoch": 0.41, "grad_norm": 0.2939462249843084, "learning_rate": 5.918003565062389e-05, "loss": 2.0621, "step": 1606 }, { "epoch": 0.41, "grad_norm": 0.2626615736613039, "learning_rate": 5.915457091927681e-05, "loss": 1.9616, "step": 1607 }, { "epoch": 0.41, "grad_norm": 0.2905264589953374, "learning_rate": 5.9129106187929715e-05, "loss": 1.9486, "step": 1608 }, { "epoch": 0.41, "grad_norm": 0.29541525087613635, "learning_rate": 5.9103641456582634e-05, "loss": 2.1314, "step": 1609 }, { "epoch": 0.41, "grad_norm": 0.24192291080967857, "learning_rate": 5.907817672523556e-05, "loss": 1.7453, "step": 1610 }, { "epoch": 0.41, "grad_norm": 0.30470200039520245, "learning_rate": 5.9052711993888465e-05, "loss": 2.0023, "step": 1611 }, { "epoch": 0.41, "grad_norm": 0.2788043180495265, "learning_rate": 5.9027247262541384e-05, "loss": 1.9452, "step": 1612 }, { "epoch": 0.41, "grad_norm": 0.24726742205938537, "learning_rate": 5.9001782531194296e-05, "loss": 1.8762, "step": 1613 }, { "epoch": 0.41, "grad_norm": 0.32697497651468815, "learning_rate": 5.8976317799847215e-05, "loss": 2.0121, "step": 1614 }, { "epoch": 0.41, "grad_norm": 0.26092398659897753, "learning_rate": 5.8950853068500134e-05, "loss": 2.0001, "step": 1615 }, { "epoch": 0.41, "grad_norm": 0.2902436225413904, "learning_rate": 5.8925388337153046e-05, "loss": 2.0014, "step": 1616 }, { "epoch": 0.41, "grad_norm": 0.28148702064899217, "learning_rate": 5.8899923605805964e-05, "loss": 1.888, "step": 1617 }, { "epoch": 0.41, "grad_norm": 0.2474122527178358, "learning_rate": 5.887445887445888e-05, "loss": 2.0164, "step": 1618 }, { "epoch": 0.41, "grad_norm": 0.24043364130359207, "learning_rate": 5.884899414311179e-05, "loss": 1.8548, "step": 1619 }, { "epoch": 0.41, "grad_norm": 0.22881301048532374, "learning_rate": 5.882352941176471e-05, "loss": 1.8628, "step": 1620 }, { "epoch": 0.41, "grad_norm": 0.25063492198815107, "learning_rate": 5.879806468041762e-05, "loss": 2.0986, "step": 1621 }, { "epoch": 0.41, "grad_norm": 0.2323713167482444, "learning_rate": 5.877259994907054e-05, "loss": 2.1365, "step": 1622 }, { "epoch": 0.41, "grad_norm": 0.24691177076215523, "learning_rate": 5.874713521772346e-05, "loss": 1.9383, "step": 1623 }, { "epoch": 0.41, "grad_norm": 0.22673647054803103, "learning_rate": 5.872167048637637e-05, "loss": 1.8255, "step": 1624 }, { "epoch": 0.41, "grad_norm": 0.24692504743712895, "learning_rate": 5.869620575502929e-05, "loss": 2.0798, "step": 1625 }, { "epoch": 0.41, "grad_norm": 0.2667273200367276, "learning_rate": 5.867074102368221e-05, "loss": 1.8868, "step": 1626 }, { "epoch": 0.41, "grad_norm": 0.22908787113265164, "learning_rate": 5.864527629233512e-05, "loss": 1.956, "step": 1627 }, { "epoch": 0.41, "grad_norm": 0.2246979414433047, "learning_rate": 5.861981156098804e-05, "loss": 2.1121, "step": 1628 }, { "epoch": 0.41, "grad_norm": 0.22929133289924503, "learning_rate": 5.859434682964094e-05, "loss": 1.8558, "step": 1629 }, { "epoch": 0.41, "grad_norm": 0.22917793347376852, "learning_rate": 5.856888209829386e-05, "loss": 1.6781, "step": 1630 }, { "epoch": 0.42, "grad_norm": 0.25641446650774763, "learning_rate": 5.854341736694679e-05, "loss": 1.7482, "step": 1631 }, { "epoch": 0.42, "grad_norm": 0.23393537298015707, "learning_rate": 5.851795263559969e-05, "loss": 1.8537, "step": 1632 }, { "epoch": 0.42, "grad_norm": 0.24914347429581077, "learning_rate": 5.849248790425261e-05, "loss": 1.888, "step": 1633 }, { "epoch": 0.42, "grad_norm": 0.24546671260082917, "learning_rate": 5.846702317290553e-05, "loss": 2.0078, "step": 1634 }, { "epoch": 0.42, "grad_norm": 0.21882815327315605, "learning_rate": 5.844155844155844e-05, "loss": 1.8693, "step": 1635 }, { "epoch": 0.42, "grad_norm": 0.2596485332949677, "learning_rate": 5.841609371021136e-05, "loss": 1.9116, "step": 1636 }, { "epoch": 0.42, "grad_norm": 0.22340332983736189, "learning_rate": 5.839062897886428e-05, "loss": 1.6942, "step": 1637 }, { "epoch": 0.42, "grad_norm": 0.30014327124952705, "learning_rate": 5.836516424751719e-05, "loss": 2.0411, "step": 1638 }, { "epoch": 0.42, "grad_norm": 0.25882642944038486, "learning_rate": 5.833969951617011e-05, "loss": 1.6902, "step": 1639 }, { "epoch": 0.42, "grad_norm": 0.25289569169685344, "learning_rate": 5.831423478482302e-05, "loss": 1.9374, "step": 1640 }, { "epoch": 0.42, "grad_norm": 0.27586659091704435, "learning_rate": 5.8288770053475936e-05, "loss": 1.6424, "step": 1641 }, { "epoch": 0.42, "grad_norm": 0.27927541316876636, "learning_rate": 5.826330532212886e-05, "loss": 1.9862, "step": 1642 }, { "epoch": 0.42, "grad_norm": 0.3011801539172539, "learning_rate": 5.8237840590781767e-05, "loss": 2.32, "step": 1643 }, { "epoch": 0.42, "grad_norm": 0.3050488261319834, "learning_rate": 5.8212375859434685e-05, "loss": 1.9606, "step": 1644 }, { "epoch": 0.42, "grad_norm": 0.2567523341578881, "learning_rate": 5.8186911128087604e-05, "loss": 2.035, "step": 1645 }, { "epoch": 0.42, "grad_norm": 0.22761743857463332, "learning_rate": 5.8161446396740516e-05, "loss": 1.8363, "step": 1646 }, { "epoch": 0.42, "grad_norm": 0.2328384256968828, "learning_rate": 5.8135981665393435e-05, "loss": 1.8999, "step": 1647 }, { "epoch": 0.42, "grad_norm": 0.32891412485440624, "learning_rate": 5.811051693404635e-05, "loss": 2.0234, "step": 1648 }, { "epoch": 0.42, "grad_norm": 0.2773328026773537, "learning_rate": 5.8085052202699266e-05, "loss": 1.807, "step": 1649 }, { "epoch": 0.42, "grad_norm": 0.26592009645438053, "learning_rate": 5.8059587471352185e-05, "loss": 1.7819, "step": 1650 }, { "epoch": 0.42, "grad_norm": 0.2350560307412541, "learning_rate": 5.803412274000509e-05, "loss": 1.854, "step": 1651 }, { "epoch": 0.42, "grad_norm": 0.2211039962011883, "learning_rate": 5.800865800865801e-05, "loss": 2.0175, "step": 1652 }, { "epoch": 0.42, "grad_norm": 0.2956828222523623, "learning_rate": 5.7983193277310935e-05, "loss": 2.0187, "step": 1653 }, { "epoch": 0.42, "grad_norm": 0.24018430575744007, "learning_rate": 5.795772854596384e-05, "loss": 1.6413, "step": 1654 }, { "epoch": 0.42, "grad_norm": 0.2851889082091988, "learning_rate": 5.793226381461676e-05, "loss": 2.0801, "step": 1655 }, { "epoch": 0.42, "grad_norm": 0.26336700333348967, "learning_rate": 5.790679908326967e-05, "loss": 1.831, "step": 1656 }, { "epoch": 0.42, "grad_norm": 0.24335249528299224, "learning_rate": 5.788133435192259e-05, "loss": 1.9065, "step": 1657 }, { "epoch": 0.42, "grad_norm": 0.3074932999384943, "learning_rate": 5.785586962057551e-05, "loss": 1.6846, "step": 1658 }, { "epoch": 0.42, "grad_norm": 0.2929328642165607, "learning_rate": 5.783040488922842e-05, "loss": 2.117, "step": 1659 }, { "epoch": 0.42, "grad_norm": 0.3113969049616286, "learning_rate": 5.780494015788134e-05, "loss": 2.0645, "step": 1660 }, { "epoch": 0.42, "grad_norm": 0.2937729796085543, "learning_rate": 5.777947542653426e-05, "loss": 1.9968, "step": 1661 }, { "epoch": 0.42, "grad_norm": 0.26455967107729744, "learning_rate": 5.7754010695187164e-05, "loss": 1.9099, "step": 1662 }, { "epoch": 0.42, "grad_norm": 0.2721072673688603, "learning_rate": 5.772854596384009e-05, "loss": 2.1461, "step": 1663 }, { "epoch": 0.42, "grad_norm": 0.23609306531007804, "learning_rate": 5.7703081232492995e-05, "loss": 2.0404, "step": 1664 }, { "epoch": 0.42, "grad_norm": 0.24975060152577083, "learning_rate": 5.7677616501145914e-05, "loss": 1.8813, "step": 1665 }, { "epoch": 0.42, "grad_norm": 0.2617133125765185, "learning_rate": 5.765215176979883e-05, "loss": 2.016, "step": 1666 }, { "epoch": 0.42, "grad_norm": 0.276917385060399, "learning_rate": 5.7626687038451745e-05, "loss": 1.8029, "step": 1667 }, { "epoch": 0.42, "grad_norm": 0.2426905483611426, "learning_rate": 5.760122230710466e-05, "loss": 1.8359, "step": 1668 }, { "epoch": 0.42, "grad_norm": 0.24819638409704883, "learning_rate": 5.757575757575758e-05, "loss": 2.1224, "step": 1669 }, { "epoch": 0.43, "grad_norm": 0.2285441111880456, "learning_rate": 5.7550292844410494e-05, "loss": 1.7729, "step": 1670 }, { "epoch": 0.43, "grad_norm": 0.21992829281266243, "learning_rate": 5.752482811306341e-05, "loss": 1.8653, "step": 1671 }, { "epoch": 0.43, "grad_norm": 0.2455672657189347, "learning_rate": 5.749936338171632e-05, "loss": 1.8892, "step": 1672 }, { "epoch": 0.43, "grad_norm": 0.21163598726865365, "learning_rate": 5.747389865036924e-05, "loss": 1.7959, "step": 1673 }, { "epoch": 0.43, "grad_norm": 0.24640105754241015, "learning_rate": 5.744843391902216e-05, "loss": 1.9364, "step": 1674 }, { "epoch": 0.43, "grad_norm": 0.26734456080587427, "learning_rate": 5.742296918767507e-05, "loss": 1.9096, "step": 1675 }, { "epoch": 0.43, "grad_norm": 0.26069813616960297, "learning_rate": 5.739750445632799e-05, "loss": 1.9483, "step": 1676 }, { "epoch": 0.43, "grad_norm": 0.23182803512512878, "learning_rate": 5.7372039724980906e-05, "loss": 1.6035, "step": 1677 }, { "epoch": 0.43, "grad_norm": 0.24343677532886382, "learning_rate": 5.734657499363382e-05, "loss": 1.854, "step": 1678 }, { "epoch": 0.43, "grad_norm": 0.2559604154069802, "learning_rate": 5.732111026228674e-05, "loss": 2.1057, "step": 1679 }, { "epoch": 0.43, "grad_norm": 0.24903703540252603, "learning_rate": 5.7295645530939656e-05, "loss": 1.9747, "step": 1680 }, { "epoch": 0.43, "grad_norm": 0.24292430011120034, "learning_rate": 5.727018079959257e-05, "loss": 1.6008, "step": 1681 }, { "epoch": 0.43, "grad_norm": 0.2597732829635861, "learning_rate": 5.724471606824549e-05, "loss": 1.8885, "step": 1682 }, { "epoch": 0.43, "grad_norm": 0.23894214436574637, "learning_rate": 5.721925133689839e-05, "loss": 1.8221, "step": 1683 }, { "epoch": 0.43, "grad_norm": 0.29846146472358814, "learning_rate": 5.719378660555131e-05, "loss": 1.7687, "step": 1684 }, { "epoch": 0.43, "grad_norm": 0.2435618750544854, "learning_rate": 5.7168321874204236e-05, "loss": 1.9643, "step": 1685 }, { "epoch": 0.43, "grad_norm": 0.20434998720844322, "learning_rate": 5.714285714285714e-05, "loss": 1.806, "step": 1686 }, { "epoch": 0.43, "grad_norm": 0.28126539082271323, "learning_rate": 5.711739241151006e-05, "loss": 2.0432, "step": 1687 }, { "epoch": 0.43, "grad_norm": 0.23609697055702877, "learning_rate": 5.709192768016298e-05, "loss": 1.7398, "step": 1688 }, { "epoch": 0.43, "grad_norm": 0.2619756036039603, "learning_rate": 5.706646294881589e-05, "loss": 2.0893, "step": 1689 }, { "epoch": 0.43, "grad_norm": 0.23441992170260872, "learning_rate": 5.704099821746881e-05, "loss": 1.808, "step": 1690 }, { "epoch": 0.43, "grad_norm": 0.28369231232122544, "learning_rate": 5.701553348612172e-05, "loss": 2.1341, "step": 1691 }, { "epoch": 0.43, "grad_norm": 0.2109833773420096, "learning_rate": 5.699006875477464e-05, "loss": 1.6876, "step": 1692 }, { "epoch": 0.43, "grad_norm": 0.2108497083722984, "learning_rate": 5.696460402342756e-05, "loss": 1.6995, "step": 1693 }, { "epoch": 0.43, "grad_norm": 0.2194012246664097, "learning_rate": 5.6939139292080465e-05, "loss": 1.9067, "step": 1694 }, { "epoch": 0.43, "grad_norm": 0.2713182613514185, "learning_rate": 5.6913674560733384e-05, "loss": 1.8971, "step": 1695 }, { "epoch": 0.43, "grad_norm": 0.296808887950505, "learning_rate": 5.688820982938631e-05, "loss": 2.255, "step": 1696 }, { "epoch": 0.43, "grad_norm": 0.2619363961593609, "learning_rate": 5.6862745098039215e-05, "loss": 1.9257, "step": 1697 }, { "epoch": 0.43, "grad_norm": 0.2310624315603793, "learning_rate": 5.6837280366692134e-05, "loss": 1.7678, "step": 1698 }, { "epoch": 0.43, "grad_norm": 0.28676857476407935, "learning_rate": 5.6811815635345046e-05, "loss": 1.8109, "step": 1699 }, { "epoch": 0.43, "grad_norm": 0.2540912101214253, "learning_rate": 5.6786350903997965e-05, "loss": 1.822, "step": 1700 }, { "epoch": 0.43, "grad_norm": 0.2529094201882292, "learning_rate": 5.6760886172650884e-05, "loss": 1.9275, "step": 1701 }, { "epoch": 0.43, "grad_norm": 0.2851786147452933, "learning_rate": 5.6735421441303796e-05, "loss": 1.9064, "step": 1702 }, { "epoch": 0.43, "grad_norm": 0.21299800439580308, "learning_rate": 5.6709956709956715e-05, "loss": 1.6424, "step": 1703 }, { "epoch": 0.43, "grad_norm": 0.23520012094612563, "learning_rate": 5.6684491978609634e-05, "loss": 1.8498, "step": 1704 }, { "epoch": 0.43, "grad_norm": 0.3675517446718101, "learning_rate": 5.665902724726254e-05, "loss": 1.991, "step": 1705 }, { "epoch": 0.43, "grad_norm": 0.2517874416912386, "learning_rate": 5.6633562515915465e-05, "loss": 1.7943, "step": 1706 }, { "epoch": 0.43, "grad_norm": 0.27320188180075, "learning_rate": 5.660809778456837e-05, "loss": 1.7237, "step": 1707 }, { "epoch": 0.43, "grad_norm": 0.221588208317137, "learning_rate": 5.658263305322129e-05, "loss": 1.6462, "step": 1708 }, { "epoch": 0.43, "grad_norm": 0.22910092053725895, "learning_rate": 5.655716832187421e-05, "loss": 1.935, "step": 1709 }, { "epoch": 0.44, "grad_norm": 0.24057608237244552, "learning_rate": 5.653170359052712e-05, "loss": 1.691, "step": 1710 }, { "epoch": 0.44, "grad_norm": 0.23637377085333372, "learning_rate": 5.650623885918004e-05, "loss": 1.9726, "step": 1711 }, { "epoch": 0.44, "grad_norm": 0.24854888578313664, "learning_rate": 5.648077412783296e-05, "loss": 1.9453, "step": 1712 }, { "epoch": 0.44, "grad_norm": 0.2606071093705677, "learning_rate": 5.645530939648587e-05, "loss": 1.7513, "step": 1713 }, { "epoch": 0.44, "grad_norm": 0.33451192583541084, "learning_rate": 5.642984466513879e-05, "loss": 1.6622, "step": 1714 }, { "epoch": 0.44, "grad_norm": 0.28049286284435937, "learning_rate": 5.640437993379171e-05, "loss": 1.9776, "step": 1715 }, { "epoch": 0.44, "grad_norm": 0.23603799050412522, "learning_rate": 5.637891520244461e-05, "loss": 1.8003, "step": 1716 }, { "epoch": 0.44, "grad_norm": 0.2525222619317922, "learning_rate": 5.635345047109754e-05, "loss": 1.7598, "step": 1717 }, { "epoch": 0.44, "grad_norm": 0.25108993878703484, "learning_rate": 5.632798573975044e-05, "loss": 1.6963, "step": 1718 }, { "epoch": 0.44, "grad_norm": 0.23628532400404173, "learning_rate": 5.630252100840336e-05, "loss": 1.6703, "step": 1719 }, { "epoch": 0.44, "grad_norm": 0.2617072771888496, "learning_rate": 5.627705627705628e-05, "loss": 1.8925, "step": 1720 }, { "epoch": 0.44, "grad_norm": 0.23966982651995303, "learning_rate": 5.625159154570919e-05, "loss": 1.9139, "step": 1721 }, { "epoch": 0.44, "grad_norm": 0.32751289677160395, "learning_rate": 5.622612681436211e-05, "loss": 1.8088, "step": 1722 }, { "epoch": 0.44, "grad_norm": 0.21991901870146513, "learning_rate": 5.620066208301503e-05, "loss": 1.8793, "step": 1723 }, { "epoch": 0.44, "grad_norm": 0.304056051250577, "learning_rate": 5.617519735166794e-05, "loss": 2.3134, "step": 1724 }, { "epoch": 0.44, "grad_norm": 0.28237868700326474, "learning_rate": 5.614973262032086e-05, "loss": 2.0754, "step": 1725 }, { "epoch": 0.44, "grad_norm": 0.28944437474500934, "learning_rate": 5.612426788897377e-05, "loss": 1.8642, "step": 1726 }, { "epoch": 0.44, "grad_norm": 0.2932810708087583, "learning_rate": 5.6098803157626686e-05, "loss": 1.7588, "step": 1727 }, { "epoch": 0.44, "grad_norm": 0.3131703101856893, "learning_rate": 5.607333842627961e-05, "loss": 2.2586, "step": 1728 }, { "epoch": 0.44, "grad_norm": 0.21605552180737272, "learning_rate": 5.604787369493252e-05, "loss": 1.6997, "step": 1729 }, { "epoch": 0.44, "grad_norm": 0.2918401341372067, "learning_rate": 5.6022408963585436e-05, "loss": 2.0401, "step": 1730 }, { "epoch": 0.44, "grad_norm": 0.24155233524645017, "learning_rate": 5.5996944232238355e-05, "loss": 1.8575, "step": 1731 }, { "epoch": 0.44, "grad_norm": 0.26069640711381625, "learning_rate": 5.597147950089127e-05, "loss": 1.8647, "step": 1732 }, { "epoch": 0.44, "grad_norm": 0.2329740136854815, "learning_rate": 5.5946014769544186e-05, "loss": 1.7878, "step": 1733 }, { "epoch": 0.44, "grad_norm": 0.22908269181710408, "learning_rate": 5.59205500381971e-05, "loss": 1.776, "step": 1734 }, { "epoch": 0.44, "grad_norm": 0.31914934779074683, "learning_rate": 5.5895085306850016e-05, "loss": 1.7529, "step": 1735 }, { "epoch": 0.44, "grad_norm": 0.23060766171717703, "learning_rate": 5.5869620575502935e-05, "loss": 1.6373, "step": 1736 }, { "epoch": 0.44, "grad_norm": 0.2634189805776267, "learning_rate": 5.584415584415584e-05, "loss": 1.8803, "step": 1737 }, { "epoch": 0.44, "grad_norm": 0.2214872810805536, "learning_rate": 5.5818691112808766e-05, "loss": 1.7981, "step": 1738 }, { "epoch": 0.44, "grad_norm": 0.2261818779823524, "learning_rate": 5.5793226381461685e-05, "loss": 1.9758, "step": 1739 }, { "epoch": 0.44, "grad_norm": 0.2376195130053332, "learning_rate": 5.576776165011459e-05, "loss": 2.0915, "step": 1740 }, { "epoch": 0.44, "grad_norm": 0.2637069820061634, "learning_rate": 5.574229691876751e-05, "loss": 1.758, "step": 1741 }, { "epoch": 0.44, "grad_norm": 0.23249804426724283, "learning_rate": 5.571683218742042e-05, "loss": 1.7041, "step": 1742 }, { "epoch": 0.44, "grad_norm": 0.25433403071838684, "learning_rate": 5.569136745607334e-05, "loss": 1.8649, "step": 1743 }, { "epoch": 0.44, "grad_norm": 0.24052920509203618, "learning_rate": 5.566590272472626e-05, "loss": 1.9441, "step": 1744 }, { "epoch": 0.44, "grad_norm": 0.2567302065595584, "learning_rate": 5.564043799337917e-05, "loss": 1.9928, "step": 1745 }, { "epoch": 0.44, "grad_norm": 0.21436613920934142, "learning_rate": 5.561497326203209e-05, "loss": 1.5653, "step": 1746 }, { "epoch": 0.44, "grad_norm": 0.3135895688776553, "learning_rate": 5.558950853068501e-05, "loss": 2.0952, "step": 1747 }, { "epoch": 0.44, "grad_norm": 0.24846845380016735, "learning_rate": 5.5564043799337914e-05, "loss": 1.8637, "step": 1748 }, { "epoch": 0.45, "grad_norm": 0.2653113926121906, "learning_rate": 5.553857906799084e-05, "loss": 1.7786, "step": 1749 }, { "epoch": 0.45, "grad_norm": 0.25425882069394706, "learning_rate": 5.5513114336643745e-05, "loss": 2.2434, "step": 1750 }, { "epoch": 0.45, "grad_norm": 0.2466540543020976, "learning_rate": 5.5487649605296664e-05, "loss": 1.8173, "step": 1751 }, { "epoch": 0.45, "grad_norm": 0.23269323327898367, "learning_rate": 5.546218487394958e-05, "loss": 1.8893, "step": 1752 }, { "epoch": 0.45, "grad_norm": 0.19195040406288671, "learning_rate": 5.5436720142602495e-05, "loss": 1.7546, "step": 1753 }, { "epoch": 0.45, "grad_norm": 0.2605782182591182, "learning_rate": 5.5411255411255414e-05, "loss": 2.0306, "step": 1754 }, { "epoch": 0.45, "grad_norm": 0.266186513361073, "learning_rate": 5.538579067990833e-05, "loss": 1.7673, "step": 1755 }, { "epoch": 0.45, "grad_norm": 0.27585153359520465, "learning_rate": 5.5360325948561245e-05, "loss": 1.8655, "step": 1756 }, { "epoch": 0.45, "grad_norm": 0.3288340400930531, "learning_rate": 5.5334861217214163e-05, "loss": 2.2879, "step": 1757 }, { "epoch": 0.45, "grad_norm": 0.2589555456763858, "learning_rate": 5.530939648586708e-05, "loss": 1.8638, "step": 1758 }, { "epoch": 0.45, "grad_norm": 0.23793950353880625, "learning_rate": 5.528393175451999e-05, "loss": 1.8433, "step": 1759 }, { "epoch": 0.45, "grad_norm": 0.24227639074433197, "learning_rate": 5.525846702317291e-05, "loss": 1.8814, "step": 1760 }, { "epoch": 0.45, "grad_norm": 0.24922419664655437, "learning_rate": 5.523300229182582e-05, "loss": 1.8689, "step": 1761 }, { "epoch": 0.45, "grad_norm": 0.24870996379466973, "learning_rate": 5.520753756047874e-05, "loss": 1.7643, "step": 1762 }, { "epoch": 0.45, "grad_norm": 0.2596777573742192, "learning_rate": 5.5182072829131656e-05, "loss": 2.0185, "step": 1763 }, { "epoch": 0.45, "grad_norm": 0.2499650915927096, "learning_rate": 5.515660809778457e-05, "loss": 2.0008, "step": 1764 }, { "epoch": 0.45, "grad_norm": 0.2318288787564209, "learning_rate": 5.513114336643749e-05, "loss": 1.8108, "step": 1765 }, { "epoch": 0.45, "grad_norm": 0.27257386468568495, "learning_rate": 5.5105678635090406e-05, "loss": 1.7635, "step": 1766 }, { "epoch": 0.45, "grad_norm": 0.2992005987613024, "learning_rate": 5.508021390374332e-05, "loss": 1.9779, "step": 1767 }, { "epoch": 0.45, "grad_norm": 0.23588669747860982, "learning_rate": 5.505474917239624e-05, "loss": 1.847, "step": 1768 }, { "epoch": 0.45, "grad_norm": 0.2188394993526254, "learning_rate": 5.502928444104914e-05, "loss": 1.6067, "step": 1769 }, { "epoch": 0.45, "grad_norm": 0.2695534600215359, "learning_rate": 5.500381970970207e-05, "loss": 2.0661, "step": 1770 }, { "epoch": 0.45, "grad_norm": 0.22736149518737195, "learning_rate": 5.497835497835499e-05, "loss": 1.6571, "step": 1771 }, { "epoch": 0.45, "grad_norm": 0.25814689277838104, "learning_rate": 5.495289024700789e-05, "loss": 1.9552, "step": 1772 }, { "epoch": 0.45, "grad_norm": 0.2238504358725983, "learning_rate": 5.492742551566081e-05, "loss": 1.5043, "step": 1773 }, { "epoch": 0.45, "grad_norm": 0.268667772932315, "learning_rate": 5.490196078431373e-05, "loss": 1.7679, "step": 1774 }, { "epoch": 0.45, "grad_norm": 0.27460783772153585, "learning_rate": 5.487649605296664e-05, "loss": 2.1356, "step": 1775 }, { "epoch": 0.45, "grad_norm": 0.2458523368494746, "learning_rate": 5.485103132161956e-05, "loss": 1.7842, "step": 1776 }, { "epoch": 0.45, "grad_norm": 0.25243776123248696, "learning_rate": 5.482556659027247e-05, "loss": 1.9312, "step": 1777 }, { "epoch": 0.45, "grad_norm": 0.265612531818153, "learning_rate": 5.480010185892539e-05, "loss": 2.0646, "step": 1778 }, { "epoch": 0.45, "grad_norm": 0.2769188648549516, "learning_rate": 5.477463712757831e-05, "loss": 1.9808, "step": 1779 }, { "epoch": 0.45, "grad_norm": 0.2503357212615843, "learning_rate": 5.4749172396231216e-05, "loss": 1.9653, "step": 1780 }, { "epoch": 0.45, "grad_norm": 0.2303122134316886, "learning_rate": 5.472370766488414e-05, "loss": 1.4754, "step": 1781 }, { "epoch": 0.45, "grad_norm": 0.20188287439135372, "learning_rate": 5.469824293353706e-05, "loss": 1.7278, "step": 1782 }, { "epoch": 0.45, "grad_norm": 0.2503709425309386, "learning_rate": 5.4672778202189966e-05, "loss": 1.8916, "step": 1783 }, { "epoch": 0.45, "grad_norm": 0.25742335282915996, "learning_rate": 5.4647313470842884e-05, "loss": 1.9465, "step": 1784 }, { "epoch": 0.45, "grad_norm": 0.24559692914006714, "learning_rate": 5.4621848739495796e-05, "loss": 2.2193, "step": 1785 }, { "epoch": 0.45, "grad_norm": 0.23153806188074785, "learning_rate": 5.4596384008148715e-05, "loss": 1.8863, "step": 1786 }, { "epoch": 0.45, "grad_norm": 0.26187621546533785, "learning_rate": 5.4570919276801634e-05, "loss": 1.7773, "step": 1787 }, { "epoch": 0.46, "grad_norm": 0.24259248835353403, "learning_rate": 5.4545454545454546e-05, "loss": 1.8412, "step": 1788 }, { "epoch": 0.46, "grad_norm": 0.24390738217557198, "learning_rate": 5.4519989814107465e-05, "loss": 1.9268, "step": 1789 }, { "epoch": 0.46, "grad_norm": 0.26336608391576083, "learning_rate": 5.4494525082760384e-05, "loss": 1.903, "step": 1790 }, { "epoch": 0.46, "grad_norm": 0.26897674687117284, "learning_rate": 5.446906035141329e-05, "loss": 1.9287, "step": 1791 }, { "epoch": 0.46, "grad_norm": 0.2731898821489289, "learning_rate": 5.4443595620066215e-05, "loss": 1.879, "step": 1792 }, { "epoch": 0.46, "grad_norm": 0.2379223434399539, "learning_rate": 5.4418130888719134e-05, "loss": 1.8735, "step": 1793 }, { "epoch": 0.46, "grad_norm": 0.25522130654246694, "learning_rate": 5.439266615737204e-05, "loss": 1.7318, "step": 1794 }, { "epoch": 0.46, "grad_norm": 0.2856176119126413, "learning_rate": 5.436720142602496e-05, "loss": 2.0151, "step": 1795 }, { "epoch": 0.46, "grad_norm": 0.23155621796811385, "learning_rate": 5.434173669467787e-05, "loss": 1.7329, "step": 1796 }, { "epoch": 0.46, "grad_norm": 0.26792895852977083, "learning_rate": 5.431627196333079e-05, "loss": 1.9689, "step": 1797 }, { "epoch": 0.46, "grad_norm": 0.2530283201692764, "learning_rate": 5.429080723198371e-05, "loss": 1.9712, "step": 1798 }, { "epoch": 0.46, "grad_norm": 0.27743368973922417, "learning_rate": 5.426534250063662e-05, "loss": 1.7018, "step": 1799 }, { "epoch": 0.46, "grad_norm": 0.2177459459956652, "learning_rate": 5.423987776928954e-05, "loss": 1.63, "step": 1800 }, { "epoch": 0.46, "grad_norm": 0.2668439859821881, "learning_rate": 5.421441303794246e-05, "loss": 2.0334, "step": 1801 }, { "epoch": 0.46, "grad_norm": 0.245502397208931, "learning_rate": 5.418894830659537e-05, "loss": 1.9641, "step": 1802 }, { "epoch": 0.46, "grad_norm": 0.2550140861285763, "learning_rate": 5.416348357524829e-05, "loss": 2.1039, "step": 1803 }, { "epoch": 0.46, "grad_norm": 0.26067228487509947, "learning_rate": 5.4138018843901194e-05, "loss": 1.9323, "step": 1804 }, { "epoch": 0.46, "grad_norm": 0.27090074391060487, "learning_rate": 5.411255411255411e-05, "loss": 1.7093, "step": 1805 }, { "epoch": 0.46, "grad_norm": 0.21757379284420492, "learning_rate": 5.408708938120703e-05, "loss": 1.5382, "step": 1806 }, { "epoch": 0.46, "grad_norm": 0.339007192047313, "learning_rate": 5.4061624649859943e-05, "loss": 2.0086, "step": 1807 }, { "epoch": 0.46, "grad_norm": 0.20279521947870238, "learning_rate": 5.403615991851286e-05, "loss": 1.6432, "step": 1808 }, { "epoch": 0.46, "grad_norm": 0.28961860158386293, "learning_rate": 5.401069518716578e-05, "loss": 1.9225, "step": 1809 }, { "epoch": 0.46, "grad_norm": 0.3011554396878597, "learning_rate": 5.398523045581869e-05, "loss": 1.9699, "step": 1810 }, { "epoch": 0.46, "grad_norm": 0.23162855467459323, "learning_rate": 5.395976572447161e-05, "loss": 1.5771, "step": 1811 }, { "epoch": 0.46, "grad_norm": 0.24026675897680333, "learning_rate": 5.393430099312452e-05, "loss": 1.8187, "step": 1812 }, { "epoch": 0.46, "grad_norm": 0.23333424918528609, "learning_rate": 5.390883626177744e-05, "loss": 1.9024, "step": 1813 }, { "epoch": 0.46, "grad_norm": 0.232109597333444, "learning_rate": 5.388337153043036e-05, "loss": 1.7673, "step": 1814 }, { "epoch": 0.46, "grad_norm": 0.25877377446919725, "learning_rate": 5.385790679908327e-05, "loss": 1.9995, "step": 1815 }, { "epoch": 0.46, "grad_norm": 0.2717296535197353, "learning_rate": 5.3832442067736186e-05, "loss": 1.9133, "step": 1816 }, { "epoch": 0.46, "grad_norm": 0.2588506805190288, "learning_rate": 5.380697733638911e-05, "loss": 1.9533, "step": 1817 }, { "epoch": 0.46, "grad_norm": 0.22576180740646443, "learning_rate": 5.378151260504202e-05, "loss": 1.7461, "step": 1818 }, { "epoch": 0.46, "grad_norm": 0.3066987268108326, "learning_rate": 5.3756047873694936e-05, "loss": 2.0566, "step": 1819 }, { "epoch": 0.46, "grad_norm": 0.2844623326773415, "learning_rate": 5.373058314234785e-05, "loss": 1.8276, "step": 1820 }, { "epoch": 0.46, "grad_norm": 0.29458131137047866, "learning_rate": 5.370511841100077e-05, "loss": 2.1424, "step": 1821 }, { "epoch": 0.46, "grad_norm": 0.36238895063597765, "learning_rate": 5.3679653679653686e-05, "loss": 2.0555, "step": 1822 }, { "epoch": 0.46, "grad_norm": 0.24611408664295198, "learning_rate": 5.365418894830659e-05, "loss": 1.7803, "step": 1823 }, { "epoch": 0.46, "grad_norm": 0.3008588467863008, "learning_rate": 5.3628724216959516e-05, "loss": 1.9502, "step": 1824 }, { "epoch": 0.46, "grad_norm": 0.27450625136992457, "learning_rate": 5.3603259485612435e-05, "loss": 2.0259, "step": 1825 }, { "epoch": 0.46, "grad_norm": 0.2666855843857169, "learning_rate": 5.357779475426534e-05, "loss": 1.8695, "step": 1826 }, { "epoch": 0.47, "grad_norm": 0.28164166094500825, "learning_rate": 5.355233002291826e-05, "loss": 2.0724, "step": 1827 }, { "epoch": 0.47, "grad_norm": 0.23709120390140725, "learning_rate": 5.352686529157117e-05, "loss": 1.9491, "step": 1828 }, { "epoch": 0.47, "grad_norm": 0.23017238700218814, "learning_rate": 5.350140056022409e-05, "loss": 1.743, "step": 1829 }, { "epoch": 0.47, "grad_norm": 0.271085734619902, "learning_rate": 5.347593582887701e-05, "loss": 1.9874, "step": 1830 }, { "epoch": 0.47, "grad_norm": 0.23922151562707258, "learning_rate": 5.345047109752992e-05, "loss": 1.9307, "step": 1831 }, { "epoch": 0.47, "grad_norm": 0.2518359954393692, "learning_rate": 5.342500636618284e-05, "loss": 1.8122, "step": 1832 }, { "epoch": 0.47, "grad_norm": 0.24450502285449374, "learning_rate": 5.339954163483576e-05, "loss": 1.6499, "step": 1833 }, { "epoch": 0.47, "grad_norm": 0.2874697544939995, "learning_rate": 5.337407690348867e-05, "loss": 2.2582, "step": 1834 }, { "epoch": 0.47, "grad_norm": 0.2691866980594534, "learning_rate": 5.334861217214159e-05, "loss": 1.8869, "step": 1835 }, { "epoch": 0.47, "grad_norm": 0.2238018696217045, "learning_rate": 5.332314744079451e-05, "loss": 1.9598, "step": 1836 }, { "epoch": 0.47, "grad_norm": 0.23242987290413744, "learning_rate": 5.3297682709447414e-05, "loss": 1.8713, "step": 1837 }, { "epoch": 0.47, "grad_norm": 0.27281451782756266, "learning_rate": 5.327221797810033e-05, "loss": 1.9389, "step": 1838 }, { "epoch": 0.47, "grad_norm": 0.2587967493607052, "learning_rate": 5.3246753246753245e-05, "loss": 2.1464, "step": 1839 }, { "epoch": 0.47, "grad_norm": 0.23820474052944823, "learning_rate": 5.3221288515406164e-05, "loss": 1.8616, "step": 1840 }, { "epoch": 0.47, "grad_norm": 0.3003991908326661, "learning_rate": 5.319582378405908e-05, "loss": 1.8697, "step": 1841 }, { "epoch": 0.47, "grad_norm": 0.24640073248782213, "learning_rate": 5.3170359052711995e-05, "loss": 1.7793, "step": 1842 }, { "epoch": 0.47, "grad_norm": 0.25041861057849335, "learning_rate": 5.3144894321364914e-05, "loss": 1.876, "step": 1843 }, { "epoch": 0.47, "grad_norm": 0.25193421344605454, "learning_rate": 5.311942959001783e-05, "loss": 1.8664, "step": 1844 }, { "epoch": 0.47, "grad_norm": 0.2665414008704307, "learning_rate": 5.3093964858670745e-05, "loss": 1.8365, "step": 1845 }, { "epoch": 0.47, "grad_norm": 0.22118430726236193, "learning_rate": 5.3068500127323663e-05, "loss": 1.6602, "step": 1846 }, { "epoch": 0.47, "grad_norm": 0.21641116153913692, "learning_rate": 5.304303539597657e-05, "loss": 1.7391, "step": 1847 }, { "epoch": 0.47, "grad_norm": 0.22607660807259428, "learning_rate": 5.301757066462949e-05, "loss": 1.7965, "step": 1848 }, { "epoch": 0.47, "grad_norm": 0.29830176926819324, "learning_rate": 5.299210593328241e-05, "loss": 1.6232, "step": 1849 }, { "epoch": 0.47, "grad_norm": 0.2354726123936021, "learning_rate": 5.296664120193532e-05, "loss": 1.777, "step": 1850 }, { "epoch": 0.47, "grad_norm": 0.257301401939976, "learning_rate": 5.294117647058824e-05, "loss": 1.7204, "step": 1851 }, { "epoch": 0.47, "grad_norm": 0.36778423842474345, "learning_rate": 5.2915711739241156e-05, "loss": 1.949, "step": 1852 }, { "epoch": 0.47, "grad_norm": 0.25158026555515783, "learning_rate": 5.289024700789407e-05, "loss": 1.6875, "step": 1853 }, { "epoch": 0.47, "grad_norm": 0.20881459657209697, "learning_rate": 5.286478227654699e-05, "loss": 1.7929, "step": 1854 }, { "epoch": 0.47, "grad_norm": 0.256358090142942, "learning_rate": 5.283931754519989e-05, "loss": 1.6186, "step": 1855 }, { "epoch": 0.47, "grad_norm": 0.2272036130351749, "learning_rate": 5.281385281385282e-05, "loss": 1.7572, "step": 1856 }, { "epoch": 0.47, "grad_norm": 0.35342186796256736, "learning_rate": 5.278838808250574e-05, "loss": 2.3487, "step": 1857 }, { "epoch": 0.47, "grad_norm": 0.2707479354519031, "learning_rate": 5.276292335115864e-05, "loss": 1.7802, "step": 1858 }, { "epoch": 0.47, "grad_norm": 0.23101653447518822, "learning_rate": 5.273745861981156e-05, "loss": 1.7922, "step": 1859 }, { "epoch": 0.47, "grad_norm": 0.2595925292186933, "learning_rate": 5.271199388846449e-05, "loss": 1.7631, "step": 1860 }, { "epoch": 0.47, "grad_norm": 0.30190522873815023, "learning_rate": 5.268652915711739e-05, "loss": 2.0733, "step": 1861 }, { "epoch": 0.47, "grad_norm": 0.26821170058681754, "learning_rate": 5.266106442577031e-05, "loss": 1.724, "step": 1862 }, { "epoch": 0.47, "grad_norm": 0.2664055016023298, "learning_rate": 5.263559969442322e-05, "loss": 2.0899, "step": 1863 }, { "epoch": 0.47, "grad_norm": 0.29562273179891857, "learning_rate": 5.261013496307614e-05, "loss": 2.1661, "step": 1864 }, { "epoch": 0.47, "grad_norm": 0.24865126380269392, "learning_rate": 5.258467023172906e-05, "loss": 1.7643, "step": 1865 }, { "epoch": 0.47, "grad_norm": 0.2373386719783883, "learning_rate": 5.255920550038197e-05, "loss": 1.8237, "step": 1866 }, { "epoch": 0.48, "grad_norm": 0.2553863812569826, "learning_rate": 5.253374076903489e-05, "loss": 1.8489, "step": 1867 }, { "epoch": 0.48, "grad_norm": 0.26032612498186936, "learning_rate": 5.250827603768781e-05, "loss": 1.6922, "step": 1868 }, { "epoch": 0.48, "grad_norm": 0.22680692806164088, "learning_rate": 5.2482811306340716e-05, "loss": 1.8473, "step": 1869 }, { "epoch": 0.48, "grad_norm": 0.24847787672899788, "learning_rate": 5.2457346574993635e-05, "loss": 1.6559, "step": 1870 }, { "epoch": 0.48, "grad_norm": 0.2480300987758116, "learning_rate": 5.243188184364656e-05, "loss": 1.9783, "step": 1871 }, { "epoch": 0.48, "grad_norm": 0.23003737440119815, "learning_rate": 5.2406417112299466e-05, "loss": 1.7818, "step": 1872 }, { "epoch": 0.48, "grad_norm": 0.26150164922956376, "learning_rate": 5.2380952380952384e-05, "loss": 1.8185, "step": 1873 }, { "epoch": 0.48, "grad_norm": 0.2949240981264513, "learning_rate": 5.2355487649605297e-05, "loss": 1.8645, "step": 1874 }, { "epoch": 0.48, "grad_norm": 0.26624205387149846, "learning_rate": 5.2330022918258215e-05, "loss": 1.8419, "step": 1875 }, { "epoch": 0.48, "grad_norm": 0.2664165203989785, "learning_rate": 5.2304558186911134e-05, "loss": 2.0172, "step": 1876 }, { "epoch": 0.48, "grad_norm": 0.26079884460045555, "learning_rate": 5.2279093455564046e-05, "loss": 2.0923, "step": 1877 }, { "epoch": 0.48, "grad_norm": 0.2581785955045546, "learning_rate": 5.2253628724216965e-05, "loss": 1.7634, "step": 1878 }, { "epoch": 0.48, "grad_norm": 0.23690102235485708, "learning_rate": 5.2228163992869884e-05, "loss": 1.8505, "step": 1879 }, { "epoch": 0.48, "grad_norm": 0.24509423745002412, "learning_rate": 5.220269926152279e-05, "loss": 1.7104, "step": 1880 }, { "epoch": 0.48, "grad_norm": 0.2330443277708947, "learning_rate": 5.217723453017571e-05, "loss": 1.6868, "step": 1881 }, { "epoch": 0.48, "grad_norm": 0.26653664885071376, "learning_rate": 5.215176979882862e-05, "loss": 2.1852, "step": 1882 }, { "epoch": 0.48, "grad_norm": 0.23328075873903847, "learning_rate": 5.212630506748154e-05, "loss": 1.6535, "step": 1883 }, { "epoch": 0.48, "grad_norm": 0.26751635875771723, "learning_rate": 5.210084033613446e-05, "loss": 1.8256, "step": 1884 }, { "epoch": 0.48, "grad_norm": 0.2456733351995663, "learning_rate": 5.207537560478737e-05, "loss": 2.0157, "step": 1885 }, { "epoch": 0.48, "grad_norm": 0.2917524265614974, "learning_rate": 5.204991087344029e-05, "loss": 1.7418, "step": 1886 }, { "epoch": 0.48, "grad_norm": 0.23558486360113196, "learning_rate": 5.202444614209321e-05, "loss": 1.563, "step": 1887 }, { "epoch": 0.48, "grad_norm": 0.23351339278521402, "learning_rate": 5.199898141074612e-05, "loss": 1.9542, "step": 1888 }, { "epoch": 0.48, "grad_norm": 0.24245373643445367, "learning_rate": 5.197351667939904e-05, "loss": 1.6765, "step": 1889 }, { "epoch": 0.48, "grad_norm": 0.26288320068940174, "learning_rate": 5.1948051948051944e-05, "loss": 1.9594, "step": 1890 }, { "epoch": 0.48, "grad_norm": 0.27609982717547316, "learning_rate": 5.192258721670486e-05, "loss": 1.8476, "step": 1891 }, { "epoch": 0.48, "grad_norm": 0.24672020543921191, "learning_rate": 5.189712248535779e-05, "loss": 2.0675, "step": 1892 }, { "epoch": 0.48, "grad_norm": 0.22504124793150443, "learning_rate": 5.1871657754010694e-05, "loss": 1.8047, "step": 1893 }, { "epoch": 0.48, "grad_norm": 0.2836563430101798, "learning_rate": 5.184619302266361e-05, "loss": 2.2403, "step": 1894 }, { "epoch": 0.48, "grad_norm": 0.25871308848958324, "learning_rate": 5.182072829131653e-05, "loss": 1.6891, "step": 1895 }, { "epoch": 0.48, "grad_norm": 0.2565945263108578, "learning_rate": 5.1795263559969444e-05, "loss": 1.8522, "step": 1896 }, { "epoch": 0.48, "grad_norm": 0.30505080448764227, "learning_rate": 5.176979882862236e-05, "loss": 1.9909, "step": 1897 }, { "epoch": 0.48, "grad_norm": 0.21616474130190202, "learning_rate": 5.174433409727527e-05, "loss": 1.9069, "step": 1898 }, { "epoch": 0.48, "grad_norm": 0.3571246636388768, "learning_rate": 5.171886936592819e-05, "loss": 2.0079, "step": 1899 }, { "epoch": 0.48, "grad_norm": 0.2444983341378719, "learning_rate": 5.169340463458111e-05, "loss": 1.9935, "step": 1900 }, { "epoch": 0.48, "grad_norm": 0.286823805394391, "learning_rate": 5.166793990323402e-05, "loss": 1.9886, "step": 1901 }, { "epoch": 0.48, "grad_norm": 0.27263026291263703, "learning_rate": 5.1642475171886936e-05, "loss": 2.0349, "step": 1902 }, { "epoch": 0.48, "grad_norm": 0.19583473006386165, "learning_rate": 5.161701044053986e-05, "loss": 1.5778, "step": 1903 }, { "epoch": 0.48, "grad_norm": 0.30509354360695173, "learning_rate": 5.159154570919277e-05, "loss": 1.9218, "step": 1904 }, { "epoch": 0.48, "grad_norm": 0.23055331246706456, "learning_rate": 5.1566080977845686e-05, "loss": 1.8517, "step": 1905 }, { "epoch": 0.49, "grad_norm": 0.25433625708835156, "learning_rate": 5.15406162464986e-05, "loss": 1.8759, "step": 1906 }, { "epoch": 0.49, "grad_norm": 0.2331749632299269, "learning_rate": 5.151515151515152e-05, "loss": 1.8752, "step": 1907 }, { "epoch": 0.49, "grad_norm": 0.27945951248133655, "learning_rate": 5.1489686783804436e-05, "loss": 1.7539, "step": 1908 }, { "epoch": 0.49, "grad_norm": 0.266353240010889, "learning_rate": 5.146422205245735e-05, "loss": 1.672, "step": 1909 }, { "epoch": 0.49, "grad_norm": 0.2553481753624911, "learning_rate": 5.143875732111027e-05, "loss": 1.7656, "step": 1910 }, { "epoch": 0.49, "grad_norm": 0.26020543432542415, "learning_rate": 5.1413292589763186e-05, "loss": 1.8238, "step": 1911 }, { "epoch": 0.49, "grad_norm": 0.26424780218388744, "learning_rate": 5.138782785841609e-05, "loss": 1.8434, "step": 1912 }, { "epoch": 0.49, "grad_norm": 0.2711295134510327, "learning_rate": 5.136236312706901e-05, "loss": 1.9735, "step": 1913 }, { "epoch": 0.49, "grad_norm": 0.2157097097218708, "learning_rate": 5.1336898395721935e-05, "loss": 1.626, "step": 1914 }, { "epoch": 0.49, "grad_norm": 0.32121250434724363, "learning_rate": 5.131143366437484e-05, "loss": 2.1219, "step": 1915 }, { "epoch": 0.49, "grad_norm": 0.2880472706853323, "learning_rate": 5.128596893302776e-05, "loss": 2.1724, "step": 1916 }, { "epoch": 0.49, "grad_norm": 0.24202615224002272, "learning_rate": 5.126050420168067e-05, "loss": 2.102, "step": 1917 }, { "epoch": 0.49, "grad_norm": 0.20535616419096947, "learning_rate": 5.123503947033359e-05, "loss": 1.5963, "step": 1918 }, { "epoch": 0.49, "grad_norm": 0.24119444261746867, "learning_rate": 5.120957473898651e-05, "loss": 1.7542, "step": 1919 }, { "epoch": 0.49, "grad_norm": 0.27009931768636947, "learning_rate": 5.118411000763942e-05, "loss": 1.8226, "step": 1920 }, { "epoch": 0.49, "grad_norm": 0.2702193253961999, "learning_rate": 5.115864527629234e-05, "loss": 1.9364, "step": 1921 }, { "epoch": 0.49, "grad_norm": 0.2631773372890249, "learning_rate": 5.113318054494526e-05, "loss": 1.9965, "step": 1922 }, { "epoch": 0.49, "grad_norm": 0.26041596253617777, "learning_rate": 5.1107715813598164e-05, "loss": 1.6373, "step": 1923 }, { "epoch": 0.49, "grad_norm": 0.3008801929182643, "learning_rate": 5.108225108225109e-05, "loss": 2.1005, "step": 1924 }, { "epoch": 0.49, "grad_norm": 0.2817868698243369, "learning_rate": 5.1056786350903995e-05, "loss": 1.9543, "step": 1925 }, { "epoch": 0.49, "grad_norm": 0.23752655834477365, "learning_rate": 5.1031321619556914e-05, "loss": 1.7105, "step": 1926 }, { "epoch": 0.49, "grad_norm": 0.29655186735574096, "learning_rate": 5.100585688820983e-05, "loss": 1.9634, "step": 1927 }, { "epoch": 0.49, "grad_norm": 0.20010623345672113, "learning_rate": 5.0980392156862745e-05, "loss": 1.6691, "step": 1928 }, { "epoch": 0.49, "grad_norm": 0.28829755607740787, "learning_rate": 5.0954927425515664e-05, "loss": 1.9682, "step": 1929 }, { "epoch": 0.49, "grad_norm": 0.28987426442953185, "learning_rate": 5.092946269416858e-05, "loss": 1.9083, "step": 1930 }, { "epoch": 0.49, "grad_norm": 0.2243905203666095, "learning_rate": 5.0903997962821495e-05, "loss": 1.6901, "step": 1931 }, { "epoch": 0.49, "grad_norm": 0.2535684686258705, "learning_rate": 5.0878533231474414e-05, "loss": 2.0169, "step": 1932 }, { "epoch": 0.49, "grad_norm": 0.21911503783525632, "learning_rate": 5.085306850012732e-05, "loss": 1.6033, "step": 1933 }, { "epoch": 0.49, "grad_norm": 0.25088447335950004, "learning_rate": 5.082760376878024e-05, "loss": 1.9437, "step": 1934 }, { "epoch": 0.49, "grad_norm": 0.2449779799112241, "learning_rate": 5.0802139037433164e-05, "loss": 1.8041, "step": 1935 }, { "epoch": 0.49, "grad_norm": 0.2579811440487833, "learning_rate": 5.077667430608607e-05, "loss": 1.7179, "step": 1936 }, { "epoch": 0.49, "grad_norm": 0.25607906257541085, "learning_rate": 5.075120957473899e-05, "loss": 1.8907, "step": 1937 }, { "epoch": 0.49, "grad_norm": 0.26921371054512905, "learning_rate": 5.0725744843391907e-05, "loss": 1.9184, "step": 1938 }, { "epoch": 0.49, "grad_norm": 0.2804322806107065, "learning_rate": 5.070028011204482e-05, "loss": 1.9893, "step": 1939 }, { "epoch": 0.49, "grad_norm": 0.2983559012579139, "learning_rate": 5.067481538069774e-05, "loss": 2.0935, "step": 1940 }, { "epoch": 0.49, "grad_norm": 0.22076470857762523, "learning_rate": 5.064935064935065e-05, "loss": 1.6101, "step": 1941 }, { "epoch": 0.49, "grad_norm": 0.22326575894872008, "learning_rate": 5.062388591800357e-05, "loss": 1.692, "step": 1942 }, { "epoch": 0.49, "grad_norm": 0.24204786951620316, "learning_rate": 5.059842118665649e-05, "loss": 1.9055, "step": 1943 }, { "epoch": 0.49, "grad_norm": 0.25870577353603014, "learning_rate": 5.057295645530939e-05, "loss": 1.9083, "step": 1944 }, { "epoch": 0.5, "grad_norm": 0.3110877551004597, "learning_rate": 5.054749172396231e-05, "loss": 2.0132, "step": 1945 }, { "epoch": 0.5, "grad_norm": 0.2437482274431188, "learning_rate": 5.052202699261524e-05, "loss": 1.6122, "step": 1946 }, { "epoch": 0.5, "grad_norm": 0.2994332264479995, "learning_rate": 5.049656226126814e-05, "loss": 1.8626, "step": 1947 }, { "epoch": 0.5, "grad_norm": 0.24268224389050214, "learning_rate": 5.047109752992106e-05, "loss": 1.7933, "step": 1948 }, { "epoch": 0.5, "grad_norm": 0.2883783053021446, "learning_rate": 5.044563279857398e-05, "loss": 1.8863, "step": 1949 }, { "epoch": 0.5, "grad_norm": 0.22717169546645896, "learning_rate": 5.042016806722689e-05, "loss": 1.8561, "step": 1950 }, { "epoch": 0.5, "grad_norm": 0.23447344619764077, "learning_rate": 5.039470333587981e-05, "loss": 1.849, "step": 1951 }, { "epoch": 0.5, "grad_norm": 0.24362514244631286, "learning_rate": 5.036923860453272e-05, "loss": 1.8075, "step": 1952 }, { "epoch": 0.5, "grad_norm": 0.26878464996614415, "learning_rate": 5.034377387318564e-05, "loss": 2.2207, "step": 1953 }, { "epoch": 0.5, "grad_norm": 0.27428559523738466, "learning_rate": 5.031830914183856e-05, "loss": 1.9848, "step": 1954 }, { "epoch": 0.5, "grad_norm": 0.23696908649902115, "learning_rate": 5.0292844410491466e-05, "loss": 1.9916, "step": 1955 }, { "epoch": 0.5, "grad_norm": 0.2436881946799392, "learning_rate": 5.026737967914439e-05, "loss": 1.8764, "step": 1956 }, { "epoch": 0.5, "grad_norm": 0.26938643227677544, "learning_rate": 5.024191494779731e-05, "loss": 1.7523, "step": 1957 }, { "epoch": 0.5, "grad_norm": 0.25242396281496726, "learning_rate": 5.0216450216450216e-05, "loss": 1.9699, "step": 1958 }, { "epoch": 0.5, "grad_norm": 0.21741305241176564, "learning_rate": 5.0190985485103135e-05, "loss": 1.6423, "step": 1959 }, { "epoch": 0.5, "grad_norm": 0.3028444643737473, "learning_rate": 5.016552075375605e-05, "loss": 1.8337, "step": 1960 }, { "epoch": 0.5, "grad_norm": 0.25914168894988954, "learning_rate": 5.0140056022408966e-05, "loss": 2.0263, "step": 1961 }, { "epoch": 0.5, "grad_norm": 0.23670939751206618, "learning_rate": 5.0114591291061884e-05, "loss": 1.7366, "step": 1962 }, { "epoch": 0.5, "grad_norm": 0.24498226814427296, "learning_rate": 5.0089126559714797e-05, "loss": 1.8961, "step": 1963 }, { "epoch": 0.5, "grad_norm": 0.25565121460317763, "learning_rate": 5.0063661828367715e-05, "loss": 1.6696, "step": 1964 }, { "epoch": 0.5, "grad_norm": 0.27791356933350964, "learning_rate": 5.0038197097020634e-05, "loss": 1.9723, "step": 1965 }, { "epoch": 0.5, "grad_norm": 0.29225794713005676, "learning_rate": 5.001273236567354e-05, "loss": 1.938, "step": 1966 }, { "epoch": 0.5, "grad_norm": 0.21643323730040626, "learning_rate": 4.9987267634326465e-05, "loss": 1.8585, "step": 1967 }, { "epoch": 0.5, "grad_norm": 0.23078934951650226, "learning_rate": 4.996180290297938e-05, "loss": 1.7416, "step": 1968 }, { "epoch": 0.5, "grad_norm": 0.22861821569749852, "learning_rate": 4.993633817163229e-05, "loss": 1.908, "step": 1969 }, { "epoch": 0.5, "grad_norm": 0.24688323103788157, "learning_rate": 4.991087344028521e-05, "loss": 1.7285, "step": 1970 }, { "epoch": 0.5, "grad_norm": 0.21804553938715954, "learning_rate": 4.988540870893813e-05, "loss": 1.8245, "step": 1971 }, { "epoch": 0.5, "grad_norm": 0.2532061682179545, "learning_rate": 4.985994397759104e-05, "loss": 1.9595, "step": 1972 }, { "epoch": 0.5, "grad_norm": 0.23886437277956296, "learning_rate": 4.983447924624395e-05, "loss": 1.735, "step": 1973 }, { "epoch": 0.5, "grad_norm": 0.2811014392346057, "learning_rate": 4.980901451489687e-05, "loss": 1.9136, "step": 1974 }, { "epoch": 0.5, "grad_norm": 0.2607547742684514, "learning_rate": 4.978354978354979e-05, "loss": 1.7918, "step": 1975 }, { "epoch": 0.5, "grad_norm": 0.26233559126035727, "learning_rate": 4.97580850522027e-05, "loss": 1.9276, "step": 1976 }, { "epoch": 0.5, "grad_norm": 0.24177929100967677, "learning_rate": 4.973262032085561e-05, "loss": 2.0133, "step": 1977 }, { "epoch": 0.5, "grad_norm": 0.2969375594630229, "learning_rate": 4.970715558950853e-05, "loss": 1.8052, "step": 1978 }, { "epoch": 0.5, "grad_norm": 0.24271014946897948, "learning_rate": 4.968169085816145e-05, "loss": 1.8089, "step": 1979 }, { "epoch": 0.5, "grad_norm": 0.22367153915182864, "learning_rate": 4.965622612681436e-05, "loss": 1.5761, "step": 1980 }, { "epoch": 0.5, "grad_norm": 0.23488780982079455, "learning_rate": 4.963076139546728e-05, "loss": 1.9722, "step": 1981 }, { "epoch": 0.5, "grad_norm": 0.22492626889472872, "learning_rate": 4.9605296664120194e-05, "loss": 1.7289, "step": 1982 }, { "epoch": 0.5, "grad_norm": 0.22267668613544087, "learning_rate": 4.957983193277311e-05, "loss": 1.8544, "step": 1983 }, { "epoch": 0.5, "grad_norm": 0.2807021633506502, "learning_rate": 4.9554367201426025e-05, "loss": 1.9977, "step": 1984 }, { "epoch": 0.51, "grad_norm": 0.26550099339272387, "learning_rate": 4.9528902470078944e-05, "loss": 1.9452, "step": 1985 }, { "epoch": 0.51, "grad_norm": 0.20578276966029152, "learning_rate": 4.950343773873186e-05, "loss": 1.6141, "step": 1986 }, { "epoch": 0.51, "grad_norm": 0.2470067285434968, "learning_rate": 4.9477973007384775e-05, "loss": 1.8066, "step": 1987 }, { "epoch": 0.51, "grad_norm": 0.22999601048399182, "learning_rate": 4.945250827603769e-05, "loss": 1.8548, "step": 1988 }, { "epoch": 0.51, "grad_norm": 0.2474929150617297, "learning_rate": 4.9427043544690605e-05, "loss": 1.8841, "step": 1989 }, { "epoch": 0.51, "grad_norm": 0.2333409865151707, "learning_rate": 4.9401578813343524e-05, "loss": 1.59, "step": 1990 }, { "epoch": 0.51, "grad_norm": 0.22675493716602751, "learning_rate": 4.9376114081996436e-05, "loss": 1.6562, "step": 1991 }, { "epoch": 0.51, "grad_norm": 0.2524665067431181, "learning_rate": 4.9350649350649355e-05, "loss": 1.8689, "step": 1992 }, { "epoch": 0.51, "grad_norm": 0.2626296603049425, "learning_rate": 4.932518461930227e-05, "loss": 1.8856, "step": 1993 }, { "epoch": 0.51, "grad_norm": 0.2791287776633305, "learning_rate": 4.9299719887955186e-05, "loss": 2.0856, "step": 1994 }, { "epoch": 0.51, "grad_norm": 0.22179263762228624, "learning_rate": 4.92742551566081e-05, "loss": 1.8493, "step": 1995 }, { "epoch": 0.51, "grad_norm": 0.2725309329893309, "learning_rate": 4.924879042526102e-05, "loss": 1.9911, "step": 1996 }, { "epoch": 0.51, "grad_norm": 0.2964582781055548, "learning_rate": 4.922332569391393e-05, "loss": 1.9537, "step": 1997 }, { "epoch": 0.51, "grad_norm": 0.2498195712358318, "learning_rate": 4.919786096256685e-05, "loss": 1.9309, "step": 1998 }, { "epoch": 0.51, "grad_norm": 0.2215946387263853, "learning_rate": 4.917239623121977e-05, "loss": 1.8822, "step": 1999 }, { "epoch": 0.51, "grad_norm": 0.23079992196823038, "learning_rate": 4.914693149987268e-05, "loss": 1.8082, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 3929, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }