{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3219, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009319664492078285, "grad_norm": 17.017115535889012, "learning_rate": 6.211180124223603e-08, "loss": 2.0136, "step": 1 }, { "epoch": 0.001863932898415657, "grad_norm": 16.50444155221578, "learning_rate": 1.2422360248447206e-07, "loss": 2.0202, "step": 2 }, { "epoch": 0.0027958993476234857, "grad_norm": 16.16768561675726, "learning_rate": 1.863354037267081e-07, "loss": 1.9589, "step": 3 }, { "epoch": 0.003727865796831314, "grad_norm": 16.787360746297665, "learning_rate": 2.484472049689441e-07, "loss": 1.8803, "step": 4 }, { "epoch": 0.004659832246039142, "grad_norm": 16.448473418809595, "learning_rate": 3.1055900621118013e-07, "loss": 1.9519, "step": 5 }, { "epoch": 0.005591798695246971, "grad_norm": 15.867406042000239, "learning_rate": 3.726708074534162e-07, "loss": 1.9244, "step": 6 }, { "epoch": 0.0065237651444548, "grad_norm": 16.444702941735038, "learning_rate": 4.347826086956522e-07, "loss": 1.9299, "step": 7 }, { "epoch": 0.007455731593662628, "grad_norm": 16.23645738861074, "learning_rate": 4.968944099378882e-07, "loss": 1.9388, "step": 8 }, { "epoch": 0.008387698042870456, "grad_norm": 16.3854800053459, "learning_rate": 5.590062111801243e-07, "loss": 1.947, "step": 9 }, { "epoch": 0.009319664492078284, "grad_norm": 16.76953813514788, "learning_rate": 6.211180124223603e-07, "loss": 1.9288, "step": 10 }, { "epoch": 0.010251630941286114, "grad_norm": 15.95791073898036, "learning_rate": 6.832298136645964e-07, "loss": 1.867, "step": 11 }, { "epoch": 0.011183597390493943, "grad_norm": 16.833568513379348, "learning_rate": 7.453416149068324e-07, "loss": 1.9644, "step": 12 }, { "epoch": 0.012115563839701771, "grad_norm": 16.59713576857796, "learning_rate": 8.074534161490684e-07, "loss": 1.9017, "step": 13 }, { "epoch": 0.0130475302889096, "grad_norm": 15.835498075147397, "learning_rate": 8.695652173913044e-07, "loss": 1.9277, "step": 14 }, { "epoch": 0.013979496738117428, "grad_norm": 16.201076928679914, "learning_rate": 9.316770186335404e-07, "loss": 1.8863, "step": 15 }, { "epoch": 0.014911463187325256, "grad_norm": 15.849335645836712, "learning_rate": 9.937888198757765e-07, "loss": 1.9487, "step": 16 }, { "epoch": 0.015843429636533086, "grad_norm": 15.771178607599738, "learning_rate": 1.0559006211180126e-06, "loss": 1.8885, "step": 17 }, { "epoch": 0.016775396085740912, "grad_norm": 14.348111968340568, "learning_rate": 1.1180124223602485e-06, "loss": 1.834, "step": 18 }, { "epoch": 0.017707362534948742, "grad_norm": 14.37908745535962, "learning_rate": 1.1801242236024846e-06, "loss": 1.8349, "step": 19 }, { "epoch": 0.01863932898415657, "grad_norm": 14.552085861900753, "learning_rate": 1.2422360248447205e-06, "loss": 1.7699, "step": 20 }, { "epoch": 0.0195712954333644, "grad_norm": 13.394106211620324, "learning_rate": 1.3043478260869566e-06, "loss": 1.7666, "step": 21 }, { "epoch": 0.02050326188257223, "grad_norm": 14.029169418758917, "learning_rate": 1.3664596273291927e-06, "loss": 1.7241, "step": 22 }, { "epoch": 0.021435228331780055, "grad_norm": 14.096299792421728, "learning_rate": 1.4285714285714286e-06, "loss": 1.7387, "step": 23 }, { "epoch": 0.022367194780987885, "grad_norm": 10.804463169195458, "learning_rate": 1.4906832298136647e-06, "loss": 1.618, "step": 24 }, { "epoch": 0.023299161230195712, "grad_norm": 10.05222389025309, "learning_rate": 1.5527950310559006e-06, "loss": 1.5072, "step": 25 }, { "epoch": 0.024231127679403542, "grad_norm": 10.296624474336681, "learning_rate": 1.6149068322981367e-06, "loss": 1.528, "step": 26 }, { "epoch": 0.02516309412861137, "grad_norm": 9.29736651526317, "learning_rate": 1.6770186335403729e-06, "loss": 1.4862, "step": 27 }, { "epoch": 0.0260950605778192, "grad_norm": 9.582431342654145, "learning_rate": 1.7391304347826088e-06, "loss": 1.4776, "step": 28 }, { "epoch": 0.02702702702702703, "grad_norm": 8.965144193432577, "learning_rate": 1.8012422360248449e-06, "loss": 1.4371, "step": 29 }, { "epoch": 0.027958993476234855, "grad_norm": 8.630727076899953, "learning_rate": 1.8633540372670808e-06, "loss": 1.4295, "step": 30 }, { "epoch": 0.028890959925442685, "grad_norm": 8.308760420151225, "learning_rate": 1.925465838509317e-06, "loss": 1.4286, "step": 31 }, { "epoch": 0.02982292637465051, "grad_norm": 7.938322316422349, "learning_rate": 1.987577639751553e-06, "loss": 1.3926, "step": 32 }, { "epoch": 0.03075489282385834, "grad_norm": 4.637445525164372, "learning_rate": 2.049689440993789e-06, "loss": 1.198, "step": 33 }, { "epoch": 0.03168685927306617, "grad_norm": 3.4798666909452036, "learning_rate": 2.111801242236025e-06, "loss": 1.152, "step": 34 }, { "epoch": 0.032618825722273995, "grad_norm": 3.434372950276135, "learning_rate": 2.173913043478261e-06, "loss": 1.2025, "step": 35 }, { "epoch": 0.033550792171481825, "grad_norm": 3.217794056246653, "learning_rate": 2.236024844720497e-06, "loss": 1.1453, "step": 36 }, { "epoch": 0.034482758620689655, "grad_norm": 3.1843545768266086, "learning_rate": 2.298136645962733e-06, "loss": 1.161, "step": 37 }, { "epoch": 0.035414725069897485, "grad_norm": 3.083362455172546, "learning_rate": 2.3602484472049692e-06, "loss": 1.1678, "step": 38 }, { "epoch": 0.036346691519105315, "grad_norm": 2.7695915611563553, "learning_rate": 2.422360248447205e-06, "loss": 1.1229, "step": 39 }, { "epoch": 0.03727865796831314, "grad_norm": 2.6777546011019466, "learning_rate": 2.484472049689441e-06, "loss": 1.1627, "step": 40 }, { "epoch": 0.03821062441752097, "grad_norm": 2.574135393001452, "learning_rate": 2.546583850931677e-06, "loss": 1.0625, "step": 41 }, { "epoch": 0.0391425908667288, "grad_norm": 2.3248188431386723, "learning_rate": 2.6086956521739132e-06, "loss": 1.1436, "step": 42 }, { "epoch": 0.04007455731593663, "grad_norm": 2.0020566845509005, "learning_rate": 2.670807453416149e-06, "loss": 1.0708, "step": 43 }, { "epoch": 0.04100652376514446, "grad_norm": 1.677951795141057, "learning_rate": 2.7329192546583855e-06, "loss": 0.9854, "step": 44 }, { "epoch": 0.04193849021435228, "grad_norm": 1.5827972518568916, "learning_rate": 2.795031055900621e-06, "loss": 1.0394, "step": 45 }, { "epoch": 0.04287045666356011, "grad_norm": 1.3457125138234156, "learning_rate": 2.8571428571428573e-06, "loss": 0.9728, "step": 46 }, { "epoch": 0.04380242311276794, "grad_norm": 1.3094324040862078, "learning_rate": 2.919254658385093e-06, "loss": 1.0177, "step": 47 }, { "epoch": 0.04473438956197577, "grad_norm": 1.17847590780011, "learning_rate": 2.9813664596273295e-06, "loss": 0.9754, "step": 48 }, { "epoch": 0.045666356011183594, "grad_norm": 1.1090764165980118, "learning_rate": 3.043478260869566e-06, "loss": 0.9469, "step": 49 }, { "epoch": 0.046598322460391424, "grad_norm": 0.9974617696857423, "learning_rate": 3.1055900621118013e-06, "loss": 0.8788, "step": 50 }, { "epoch": 0.047530288909599254, "grad_norm": 1.185846079369585, "learning_rate": 3.1677018633540376e-06, "loss": 0.8855, "step": 51 }, { "epoch": 0.048462255358807084, "grad_norm": 0.9844763294604671, "learning_rate": 3.2298136645962735e-06, "loss": 0.8996, "step": 52 }, { "epoch": 0.049394221808014914, "grad_norm": 0.9705901082754779, "learning_rate": 3.29192546583851e-06, "loss": 0.9104, "step": 53 }, { "epoch": 0.05032618825722274, "grad_norm": 0.8203989351369354, "learning_rate": 3.3540372670807457e-06, "loss": 0.8736, "step": 54 }, { "epoch": 0.05125815470643057, "grad_norm": 0.8578851846838254, "learning_rate": 3.4161490683229816e-06, "loss": 0.9486, "step": 55 }, { "epoch": 0.0521901211556384, "grad_norm": 0.7774548669260155, "learning_rate": 3.4782608695652175e-06, "loss": 0.8468, "step": 56 }, { "epoch": 0.05312208760484623, "grad_norm": 0.6913299678275155, "learning_rate": 3.540372670807454e-06, "loss": 0.8479, "step": 57 }, { "epoch": 0.05405405405405406, "grad_norm": 0.7386078804137914, "learning_rate": 3.6024844720496897e-06, "loss": 0.8618, "step": 58 }, { "epoch": 0.05498602050326188, "grad_norm": 0.638313026719666, "learning_rate": 3.664596273291926e-06, "loss": 0.8005, "step": 59 }, { "epoch": 0.05591798695246971, "grad_norm": 0.530638256032114, "learning_rate": 3.7267080745341615e-06, "loss": 0.8157, "step": 60 }, { "epoch": 0.05684995340167754, "grad_norm": 0.5414644892096061, "learning_rate": 3.788819875776398e-06, "loss": 0.8353, "step": 61 }, { "epoch": 0.05778191985088537, "grad_norm": 0.516866019529983, "learning_rate": 3.850931677018634e-06, "loss": 0.8182, "step": 62 }, { "epoch": 0.05871388630009319, "grad_norm": 0.5041309291687108, "learning_rate": 3.91304347826087e-06, "loss": 0.7732, "step": 63 }, { "epoch": 0.05964585274930102, "grad_norm": 0.5329256730456686, "learning_rate": 3.975155279503106e-06, "loss": 0.8093, "step": 64 }, { "epoch": 0.06057781919850885, "grad_norm": 0.5291267883428311, "learning_rate": 4.037267080745342e-06, "loss": 0.735, "step": 65 }, { "epoch": 0.06150978564771668, "grad_norm": 0.42902903147801047, "learning_rate": 4.099378881987578e-06, "loss": 0.7972, "step": 66 }, { "epoch": 0.06244175209692451, "grad_norm": 0.4434368526625587, "learning_rate": 4.1614906832298145e-06, "loss": 0.7946, "step": 67 }, { "epoch": 0.06337371854613234, "grad_norm": 0.40100780841480027, "learning_rate": 4.22360248447205e-06, "loss": 0.7592, "step": 68 }, { "epoch": 0.06430568499534017, "grad_norm": 0.42791115896402737, "learning_rate": 4.2857142857142855e-06, "loss": 0.7565, "step": 69 }, { "epoch": 0.06523765144454799, "grad_norm": 0.40529862187993165, "learning_rate": 4.347826086956522e-06, "loss": 0.7613, "step": 70 }, { "epoch": 0.06616961789375582, "grad_norm": 0.3933474947285965, "learning_rate": 4.409937888198758e-06, "loss": 0.7588, "step": 71 }, { "epoch": 0.06710158434296365, "grad_norm": 0.3527051747688975, "learning_rate": 4.472049689440994e-06, "loss": 0.7659, "step": 72 }, { "epoch": 0.06803355079217148, "grad_norm": 0.35990675171637626, "learning_rate": 4.534161490683231e-06, "loss": 0.741, "step": 73 }, { "epoch": 0.06896551724137931, "grad_norm": 0.39561500645235886, "learning_rate": 4.596273291925466e-06, "loss": 0.7274, "step": 74 }, { "epoch": 0.06989748369058714, "grad_norm": 0.3444922120890626, "learning_rate": 4.6583850931677025e-06, "loss": 0.7402, "step": 75 }, { "epoch": 0.07082945013979497, "grad_norm": 0.3755730382383842, "learning_rate": 4.7204968944099384e-06, "loss": 0.756, "step": 76 }, { "epoch": 0.0717614165890028, "grad_norm": 0.332763528688733, "learning_rate": 4.782608695652174e-06, "loss": 0.7556, "step": 77 }, { "epoch": 0.07269338303821063, "grad_norm": 0.2999094093474444, "learning_rate": 4.84472049689441e-06, "loss": 0.7068, "step": 78 }, { "epoch": 0.07362534948741846, "grad_norm": 0.2913764586916152, "learning_rate": 4.906832298136646e-06, "loss": 0.6953, "step": 79 }, { "epoch": 0.07455731593662628, "grad_norm": 0.3729909644093024, "learning_rate": 4.968944099378882e-06, "loss": 0.7525, "step": 80 }, { "epoch": 0.0754892823858341, "grad_norm": 0.3204357072596035, "learning_rate": 5.031055900621118e-06, "loss": 0.732, "step": 81 }, { "epoch": 0.07642124883504194, "grad_norm": 0.3320873901978301, "learning_rate": 5.093167701863354e-06, "loss": 0.7058, "step": 82 }, { "epoch": 0.07735321528424977, "grad_norm": 0.2832921910793618, "learning_rate": 5.155279503105591e-06, "loss": 0.7069, "step": 83 }, { "epoch": 0.0782851817334576, "grad_norm": 0.2924938506664702, "learning_rate": 5.2173913043478265e-06, "loss": 0.7171, "step": 84 }, { "epoch": 0.07921714818266543, "grad_norm": 0.31060995115862167, "learning_rate": 5.279503105590062e-06, "loss": 0.7101, "step": 85 }, { "epoch": 0.08014911463187326, "grad_norm": 0.2825954781730592, "learning_rate": 5.341614906832298e-06, "loss": 0.7299, "step": 86 }, { "epoch": 0.08108108108108109, "grad_norm": 0.26257407993390564, "learning_rate": 5.403726708074535e-06, "loss": 0.7276, "step": 87 }, { "epoch": 0.08201304753028892, "grad_norm": 0.2647355359474867, "learning_rate": 5.465838509316771e-06, "loss": 0.732, "step": 88 }, { "epoch": 0.08294501397949673, "grad_norm": 0.6874409201849152, "learning_rate": 5.527950310559007e-06, "loss": 0.7548, "step": 89 }, { "epoch": 0.08387698042870456, "grad_norm": 0.28979530358584626, "learning_rate": 5.590062111801242e-06, "loss": 0.6593, "step": 90 }, { "epoch": 0.08480894687791239, "grad_norm": 0.26511507446048627, "learning_rate": 5.652173913043479e-06, "loss": 0.7126, "step": 91 }, { "epoch": 0.08574091332712022, "grad_norm": 0.30233045091914007, "learning_rate": 5.7142857142857145e-06, "loss": 0.6957, "step": 92 }, { "epoch": 0.08667287977632805, "grad_norm": 0.27364104417213153, "learning_rate": 5.77639751552795e-06, "loss": 0.6986, "step": 93 }, { "epoch": 0.08760484622553588, "grad_norm": 0.24184333856163384, "learning_rate": 5.838509316770186e-06, "loss": 0.6938, "step": 94 }, { "epoch": 0.08853681267474371, "grad_norm": 0.2574142718698492, "learning_rate": 5.900621118012423e-06, "loss": 0.6785, "step": 95 }, { "epoch": 0.08946877912395154, "grad_norm": 0.3584165169045767, "learning_rate": 5.962732919254659e-06, "loss": 0.7146, "step": 96 }, { "epoch": 0.09040074557315937, "grad_norm": 0.24230615304583505, "learning_rate": 6.024844720496895e-06, "loss": 0.6329, "step": 97 }, { "epoch": 0.09133271202236719, "grad_norm": 0.2507239958030005, "learning_rate": 6.086956521739132e-06, "loss": 0.6535, "step": 98 }, { "epoch": 0.09226467847157502, "grad_norm": 0.26323197350540634, "learning_rate": 6.1490683229813675e-06, "loss": 0.6964, "step": 99 }, { "epoch": 0.09319664492078285, "grad_norm": 0.2599048992047766, "learning_rate": 6.2111801242236025e-06, "loss": 0.6906, "step": 100 }, { "epoch": 0.09412861136999068, "grad_norm": 0.3023924371768309, "learning_rate": 6.2732919254658384e-06, "loss": 0.6763, "step": 101 }, { "epoch": 0.09506057781919851, "grad_norm": 0.22078410369513882, "learning_rate": 6.335403726708075e-06, "loss": 0.6575, "step": 102 }, { "epoch": 0.09599254426840634, "grad_norm": 0.2485988296311015, "learning_rate": 6.397515527950311e-06, "loss": 0.6827, "step": 103 }, { "epoch": 0.09692451071761417, "grad_norm": 0.21747631471901122, "learning_rate": 6.459627329192547e-06, "loss": 0.674, "step": 104 }, { "epoch": 0.097856477166822, "grad_norm": 0.20686933334840713, "learning_rate": 6.521739130434783e-06, "loss": 0.6413, "step": 105 }, { "epoch": 0.09878844361602983, "grad_norm": 0.2287629504013785, "learning_rate": 6.58385093167702e-06, "loss": 0.6665, "step": 106 }, { "epoch": 0.09972041006523766, "grad_norm": 0.2062079574933486, "learning_rate": 6.6459627329192555e-06, "loss": 0.6474, "step": 107 }, { "epoch": 0.10065237651444547, "grad_norm": 0.2723370986464238, "learning_rate": 6.7080745341614914e-06, "loss": 0.7034, "step": 108 }, { "epoch": 0.1015843429636533, "grad_norm": 0.24228726385575017, "learning_rate": 6.7701863354037265e-06, "loss": 0.6428, "step": 109 }, { "epoch": 0.10251630941286113, "grad_norm": 0.23438881555949534, "learning_rate": 6.832298136645963e-06, "loss": 0.6745, "step": 110 }, { "epoch": 0.10344827586206896, "grad_norm": 0.21499015619570377, "learning_rate": 6.894409937888199e-06, "loss": 0.6463, "step": 111 }, { "epoch": 0.1043802423112768, "grad_norm": 0.235031196265718, "learning_rate": 6.956521739130435e-06, "loss": 0.7052, "step": 112 }, { "epoch": 0.10531220876048462, "grad_norm": 0.20902549183133895, "learning_rate": 7.018633540372671e-06, "loss": 0.6782, "step": 113 }, { "epoch": 0.10624417520969245, "grad_norm": 0.19652201990353677, "learning_rate": 7.080745341614908e-06, "loss": 0.6333, "step": 114 }, { "epoch": 0.10717614165890028, "grad_norm": 0.21589933831464472, "learning_rate": 7.1428571428571436e-06, "loss": 0.6397, "step": 115 }, { "epoch": 0.10810810810810811, "grad_norm": 0.2042115275764884, "learning_rate": 7.2049689440993795e-06, "loss": 0.6406, "step": 116 }, { "epoch": 0.10904007455731593, "grad_norm": 0.20964634840751376, "learning_rate": 7.267080745341616e-06, "loss": 0.6459, "step": 117 }, { "epoch": 0.10997204100652376, "grad_norm": 0.7411918404279729, "learning_rate": 7.329192546583852e-06, "loss": 0.6575, "step": 118 }, { "epoch": 0.11090400745573159, "grad_norm": 0.32978183435117453, "learning_rate": 7.391304347826087e-06, "loss": 0.6454, "step": 119 }, { "epoch": 0.11183597390493942, "grad_norm": 0.2045501712723584, "learning_rate": 7.453416149068323e-06, "loss": 0.6388, "step": 120 }, { "epoch": 0.11276794035414725, "grad_norm": 0.21273708031291497, "learning_rate": 7.515527950310559e-06, "loss": 0.6667, "step": 121 }, { "epoch": 0.11369990680335508, "grad_norm": 0.2576947553479958, "learning_rate": 7.577639751552796e-06, "loss": 0.6632, "step": 122 }, { "epoch": 0.11463187325256291, "grad_norm": 0.19803704979095446, "learning_rate": 7.639751552795032e-06, "loss": 0.6195, "step": 123 }, { "epoch": 0.11556383970177074, "grad_norm": 0.1963313535139179, "learning_rate": 7.701863354037268e-06, "loss": 0.6333, "step": 124 }, { "epoch": 0.11649580615097857, "grad_norm": 0.1897816574783627, "learning_rate": 7.763975155279503e-06, "loss": 0.6051, "step": 125 }, { "epoch": 0.11742777260018639, "grad_norm": 0.19572172962919804, "learning_rate": 7.82608695652174e-06, "loss": 0.6498, "step": 126 }, { "epoch": 0.11835973904939422, "grad_norm": 0.21880047083522808, "learning_rate": 7.888198757763977e-06, "loss": 0.6534, "step": 127 }, { "epoch": 0.11929170549860205, "grad_norm": 0.24980450844847382, "learning_rate": 7.950310559006212e-06, "loss": 0.6425, "step": 128 }, { "epoch": 0.12022367194780988, "grad_norm": 0.28674739950354056, "learning_rate": 8.012422360248447e-06, "loss": 0.6316, "step": 129 }, { "epoch": 0.1211556383970177, "grad_norm": 0.20615235218718322, "learning_rate": 8.074534161490684e-06, "loss": 0.6421, "step": 130 }, { "epoch": 0.12208760484622554, "grad_norm": 0.23381531474112316, "learning_rate": 8.13664596273292e-06, "loss": 0.6233, "step": 131 }, { "epoch": 0.12301957129543337, "grad_norm": 0.3001504489663318, "learning_rate": 8.198757763975156e-06, "loss": 0.6598, "step": 132 }, { "epoch": 0.1239515377446412, "grad_norm": 0.2173405311182925, "learning_rate": 8.260869565217392e-06, "loss": 0.6493, "step": 133 }, { "epoch": 0.12488350419384903, "grad_norm": 0.24977982351253636, "learning_rate": 8.322981366459629e-06, "loss": 0.6776, "step": 134 }, { "epoch": 0.12581547064305684, "grad_norm": 0.19727826443905885, "learning_rate": 8.385093167701864e-06, "loss": 0.6102, "step": 135 }, { "epoch": 0.1267474370922647, "grad_norm": 0.18692371042484018, "learning_rate": 8.4472049689441e-06, "loss": 0.6132, "step": 136 }, { "epoch": 0.1276794035414725, "grad_norm": 0.2235962202951247, "learning_rate": 8.509316770186336e-06, "loss": 0.622, "step": 137 }, { "epoch": 0.12861136999068035, "grad_norm": 0.23174818075498654, "learning_rate": 8.571428571428571e-06, "loss": 0.6384, "step": 138 }, { "epoch": 0.12954333643988816, "grad_norm": 0.19123866023121291, "learning_rate": 8.633540372670808e-06, "loss": 0.6076, "step": 139 }, { "epoch": 0.13047530288909598, "grad_norm": 0.19424539303942895, "learning_rate": 8.695652173913044e-06, "loss": 0.6337, "step": 140 }, { "epoch": 0.13140726933830382, "grad_norm": 0.22953912427100304, "learning_rate": 8.75776397515528e-06, "loss": 0.6849, "step": 141 }, { "epoch": 0.13233923578751164, "grad_norm": 0.20784126067583292, "learning_rate": 8.819875776397516e-06, "loss": 0.6369, "step": 142 }, { "epoch": 0.13327120223671948, "grad_norm": 0.21517676009004955, "learning_rate": 8.881987577639753e-06, "loss": 0.6136, "step": 143 }, { "epoch": 0.1342031686859273, "grad_norm": 0.25916862102161575, "learning_rate": 8.944099378881988e-06, "loss": 0.6773, "step": 144 }, { "epoch": 0.13513513513513514, "grad_norm": 0.20140896332345345, "learning_rate": 9.006211180124225e-06, "loss": 0.6477, "step": 145 }, { "epoch": 0.13606710158434296, "grad_norm": 0.20496638077649088, "learning_rate": 9.068322981366461e-06, "loss": 0.6304, "step": 146 }, { "epoch": 0.1369990680335508, "grad_norm": 0.29226685279002573, "learning_rate": 9.130434782608697e-06, "loss": 0.6394, "step": 147 }, { "epoch": 0.13793103448275862, "grad_norm": 0.23330464136590923, "learning_rate": 9.192546583850932e-06, "loss": 0.5798, "step": 148 }, { "epoch": 0.13886300093196646, "grad_norm": 0.23725741360710417, "learning_rate": 9.254658385093168e-06, "loss": 0.599, "step": 149 }, { "epoch": 0.13979496738117428, "grad_norm": 0.20479157914643706, "learning_rate": 9.316770186335405e-06, "loss": 0.6643, "step": 150 }, { "epoch": 0.1407269338303821, "grad_norm": 0.22737918562456944, "learning_rate": 9.37888198757764e-06, "loss": 0.6545, "step": 151 }, { "epoch": 0.14165890027958994, "grad_norm": 0.22606131761134923, "learning_rate": 9.440993788819877e-06, "loss": 0.6378, "step": 152 }, { "epoch": 0.14259086672879775, "grad_norm": 0.21100147648927023, "learning_rate": 9.503105590062112e-06, "loss": 0.6202, "step": 153 }, { "epoch": 0.1435228331780056, "grad_norm": 0.2017246401862002, "learning_rate": 9.565217391304349e-06, "loss": 0.6371, "step": 154 }, { "epoch": 0.14445479962721341, "grad_norm": 0.21766955847130265, "learning_rate": 9.627329192546585e-06, "loss": 0.6087, "step": 155 }, { "epoch": 0.14538676607642126, "grad_norm": 0.19998937391175284, "learning_rate": 9.68944099378882e-06, "loss": 0.5961, "step": 156 }, { "epoch": 0.14631873252562907, "grad_norm": 0.19873452444518203, "learning_rate": 9.751552795031056e-06, "loss": 0.5955, "step": 157 }, { "epoch": 0.14725069897483692, "grad_norm": 0.1833247823807435, "learning_rate": 9.813664596273292e-06, "loss": 0.6244, "step": 158 }, { "epoch": 0.14818266542404473, "grad_norm": 0.2194885064122921, "learning_rate": 9.875776397515529e-06, "loss": 0.6153, "step": 159 }, { "epoch": 0.14911463187325255, "grad_norm": 0.2012303744267507, "learning_rate": 9.937888198757764e-06, "loss": 0.6009, "step": 160 }, { "epoch": 0.1500465983224604, "grad_norm": 0.18994710598440667, "learning_rate": 1e-05, "loss": 0.6, "step": 161 }, { "epoch": 0.1509785647716682, "grad_norm": 0.24427250929528857, "learning_rate": 1.0062111801242236e-05, "loss": 0.6095, "step": 162 }, { "epoch": 0.15191053122087605, "grad_norm": 0.23232211445187412, "learning_rate": 1.0124223602484473e-05, "loss": 0.6205, "step": 163 }, { "epoch": 0.15284249767008387, "grad_norm": 0.22725500637251334, "learning_rate": 1.0186335403726708e-05, "loss": 0.6419, "step": 164 }, { "epoch": 0.15377446411929171, "grad_norm": 0.21022670016460035, "learning_rate": 1.0248447204968946e-05, "loss": 0.6289, "step": 165 }, { "epoch": 0.15470643056849953, "grad_norm": 0.18626308155529395, "learning_rate": 1.0310559006211181e-05, "loss": 0.6113, "step": 166 }, { "epoch": 0.15563839701770738, "grad_norm": 0.20206242692800191, "learning_rate": 1.0372670807453418e-05, "loss": 0.5884, "step": 167 }, { "epoch": 0.1565703634669152, "grad_norm": 0.20455779100136276, "learning_rate": 1.0434782608695653e-05, "loss": 0.6086, "step": 168 }, { "epoch": 0.157502329916123, "grad_norm": 0.20902639677351295, "learning_rate": 1.049689440993789e-05, "loss": 0.5627, "step": 169 }, { "epoch": 0.15843429636533085, "grad_norm": 0.17653790312654627, "learning_rate": 1.0559006211180125e-05, "loss": 0.5802, "step": 170 }, { "epoch": 0.15936626281453867, "grad_norm": 0.2527145282687491, "learning_rate": 1.062111801242236e-05, "loss": 0.5994, "step": 171 }, { "epoch": 0.1602982292637465, "grad_norm": 0.19982140057554743, "learning_rate": 1.0683229813664597e-05, "loss": 0.5929, "step": 172 }, { "epoch": 0.16123019571295433, "grad_norm": 0.19966536003507998, "learning_rate": 1.0745341614906832e-05, "loss": 0.646, "step": 173 }, { "epoch": 0.16216216216216217, "grad_norm": 0.18570753320404712, "learning_rate": 1.080745341614907e-05, "loss": 0.578, "step": 174 }, { "epoch": 0.16309412861137, "grad_norm": 0.2581134283822079, "learning_rate": 1.0869565217391305e-05, "loss": 0.6019, "step": 175 }, { "epoch": 0.16402609506057783, "grad_norm": 0.1886167624653722, "learning_rate": 1.0931677018633542e-05, "loss": 0.5785, "step": 176 }, { "epoch": 0.16495806150978565, "grad_norm": 0.18780402956873624, "learning_rate": 1.0993788819875777e-05, "loss": 0.581, "step": 177 }, { "epoch": 0.16589002795899346, "grad_norm": 0.21590374200987986, "learning_rate": 1.1055900621118014e-05, "loss": 0.5912, "step": 178 }, { "epoch": 0.1668219944082013, "grad_norm": 0.1980009064972523, "learning_rate": 1.1118012422360249e-05, "loss": 0.6247, "step": 179 }, { "epoch": 0.16775396085740912, "grad_norm": 0.18471078637015056, "learning_rate": 1.1180124223602484e-05, "loss": 0.5987, "step": 180 }, { "epoch": 0.16868592730661697, "grad_norm": 0.30878749020200075, "learning_rate": 1.1242236024844722e-05, "loss": 0.5788, "step": 181 }, { "epoch": 0.16961789375582478, "grad_norm": 0.2652716421797194, "learning_rate": 1.1304347826086957e-05, "loss": 0.6431, "step": 182 }, { "epoch": 0.17054986020503263, "grad_norm": 0.17821243092039143, "learning_rate": 1.1366459627329194e-05, "loss": 0.575, "step": 183 }, { "epoch": 0.17148182665424044, "grad_norm": 0.18829698182441362, "learning_rate": 1.1428571428571429e-05, "loss": 0.595, "step": 184 }, { "epoch": 0.1724137931034483, "grad_norm": 0.2283921971388746, "learning_rate": 1.1490683229813666e-05, "loss": 0.6052, "step": 185 }, { "epoch": 0.1733457595526561, "grad_norm": 0.1782676238933828, "learning_rate": 1.15527950310559e-05, "loss": 0.5988, "step": 186 }, { "epoch": 0.17427772600186392, "grad_norm": 0.18686159583322093, "learning_rate": 1.161490683229814e-05, "loss": 0.5985, "step": 187 }, { "epoch": 0.17520969245107176, "grad_norm": 0.19559666436894627, "learning_rate": 1.1677018633540373e-05, "loss": 0.584, "step": 188 }, { "epoch": 0.17614165890027958, "grad_norm": 0.1896042325321541, "learning_rate": 1.1739130434782611e-05, "loss": 0.6475, "step": 189 }, { "epoch": 0.17707362534948742, "grad_norm": 0.18201911340097623, "learning_rate": 1.1801242236024846e-05, "loss": 0.615, "step": 190 }, { "epoch": 0.17800559179869524, "grad_norm": 0.23889308212982904, "learning_rate": 1.1863354037267081e-05, "loss": 0.6069, "step": 191 }, { "epoch": 0.17893755824790308, "grad_norm": 0.1852596983561888, "learning_rate": 1.1925465838509318e-05, "loss": 0.5648, "step": 192 }, { "epoch": 0.1798695246971109, "grad_norm": 0.19269275937616784, "learning_rate": 1.1987577639751553e-05, "loss": 0.5927, "step": 193 }, { "epoch": 0.18080149114631874, "grad_norm": 0.23385755931362112, "learning_rate": 1.204968944099379e-05, "loss": 0.6148, "step": 194 }, { "epoch": 0.18173345759552656, "grad_norm": 0.19622790004948765, "learning_rate": 1.2111801242236025e-05, "loss": 0.6043, "step": 195 }, { "epoch": 0.18266542404473438, "grad_norm": 0.1970196079797927, "learning_rate": 1.2173913043478263e-05, "loss": 0.5995, "step": 196 }, { "epoch": 0.18359739049394222, "grad_norm": 0.22971357759703187, "learning_rate": 1.2236024844720498e-05, "loss": 0.6215, "step": 197 }, { "epoch": 0.18452935694315004, "grad_norm": 0.19121359022823675, "learning_rate": 1.2298136645962735e-05, "loss": 0.5857, "step": 198 }, { "epoch": 0.18546132339235788, "grad_norm": 0.18343087309929315, "learning_rate": 1.236024844720497e-05, "loss": 0.6126, "step": 199 }, { "epoch": 0.1863932898415657, "grad_norm": 0.1806355397940237, "learning_rate": 1.2422360248447205e-05, "loss": 0.5712, "step": 200 }, { "epoch": 0.18732525629077354, "grad_norm": 0.1802932740005755, "learning_rate": 1.2484472049689442e-05, "loss": 0.5849, "step": 201 }, { "epoch": 0.18825722273998136, "grad_norm": 0.20563925013828346, "learning_rate": 1.2546583850931677e-05, "loss": 0.5717, "step": 202 }, { "epoch": 0.1891891891891892, "grad_norm": 0.18712651892688753, "learning_rate": 1.2608695652173915e-05, "loss": 0.5655, "step": 203 }, { "epoch": 0.19012115563839702, "grad_norm": 0.18374968395440877, "learning_rate": 1.267080745341615e-05, "loss": 0.5994, "step": 204 }, { "epoch": 0.19105312208760486, "grad_norm": 0.19613517259405855, "learning_rate": 1.2732919254658387e-05, "loss": 0.5832, "step": 205 }, { "epoch": 0.19198508853681268, "grad_norm": 0.23426939635539784, "learning_rate": 1.2795031055900622e-05, "loss": 0.6341, "step": 206 }, { "epoch": 0.1929170549860205, "grad_norm": 0.1839144663594488, "learning_rate": 1.2857142857142859e-05, "loss": 0.5951, "step": 207 }, { "epoch": 0.19384902143522834, "grad_norm": 0.19003505069566956, "learning_rate": 1.2919254658385094e-05, "loss": 0.5892, "step": 208 }, { "epoch": 0.19478098788443615, "grad_norm": 0.177289062805138, "learning_rate": 1.2981366459627329e-05, "loss": 0.5641, "step": 209 }, { "epoch": 0.195712954333644, "grad_norm": 0.1895576052739334, "learning_rate": 1.3043478260869566e-05, "loss": 0.6077, "step": 210 }, { "epoch": 0.1966449207828518, "grad_norm": 0.24705983933197062, "learning_rate": 1.31055900621118e-05, "loss": 0.6177, "step": 211 }, { "epoch": 0.19757688723205966, "grad_norm": 0.208444844373892, "learning_rate": 1.316770186335404e-05, "loss": 0.6026, "step": 212 }, { "epoch": 0.19850885368126747, "grad_norm": 0.20932317391931282, "learning_rate": 1.3229813664596274e-05, "loss": 0.5646, "step": 213 }, { "epoch": 0.19944082013047532, "grad_norm": 0.20796258019999062, "learning_rate": 1.3291925465838511e-05, "loss": 0.5798, "step": 214 }, { "epoch": 0.20037278657968313, "grad_norm": 0.1909128758935929, "learning_rate": 1.3354037267080746e-05, "loss": 0.5356, "step": 215 }, { "epoch": 0.20130475302889095, "grad_norm": 0.17577890646286617, "learning_rate": 1.3416149068322983e-05, "loss": 0.5597, "step": 216 }, { "epoch": 0.2022367194780988, "grad_norm": 0.24964804484604822, "learning_rate": 1.3478260869565218e-05, "loss": 0.5579, "step": 217 }, { "epoch": 0.2031686859273066, "grad_norm": 0.2029053120406279, "learning_rate": 1.3540372670807453e-05, "loss": 0.6235, "step": 218 }, { "epoch": 0.20410065237651445, "grad_norm": 0.173865607318472, "learning_rate": 1.3602484472049691e-05, "loss": 0.5601, "step": 219 }, { "epoch": 0.20503261882572227, "grad_norm": 0.1998337129961261, "learning_rate": 1.3664596273291926e-05, "loss": 0.5999, "step": 220 }, { "epoch": 0.2059645852749301, "grad_norm": 0.19416038279833958, "learning_rate": 1.3726708074534163e-05, "loss": 0.5606, "step": 221 }, { "epoch": 0.20689655172413793, "grad_norm": 0.1868094523994442, "learning_rate": 1.3788819875776398e-05, "loss": 0.5614, "step": 222 }, { "epoch": 0.20782851817334577, "grad_norm": 0.20155153597276104, "learning_rate": 1.3850931677018635e-05, "loss": 0.5558, "step": 223 }, { "epoch": 0.2087604846225536, "grad_norm": 0.16422175560594163, "learning_rate": 1.391304347826087e-05, "loss": 0.543, "step": 224 }, { "epoch": 0.2096924510717614, "grad_norm": 0.19287316294145215, "learning_rate": 1.3975155279503107e-05, "loss": 0.5787, "step": 225 }, { "epoch": 0.21062441752096925, "grad_norm": 0.19110625768781372, "learning_rate": 1.4037267080745342e-05, "loss": 0.5538, "step": 226 }, { "epoch": 0.21155638397017706, "grad_norm": 0.1915416166199141, "learning_rate": 1.409937888198758e-05, "loss": 0.5955, "step": 227 }, { "epoch": 0.2124883504193849, "grad_norm": 0.17645556056338366, "learning_rate": 1.4161490683229815e-05, "loss": 0.5509, "step": 228 }, { "epoch": 0.21342031686859272, "grad_norm": 0.1923230323541307, "learning_rate": 1.422360248447205e-05, "loss": 0.5972, "step": 229 }, { "epoch": 0.21435228331780057, "grad_norm": 0.1765165368441566, "learning_rate": 1.4285714285714287e-05, "loss": 0.5748, "step": 230 }, { "epoch": 0.21528424976700838, "grad_norm": 0.1829703164092614, "learning_rate": 1.4347826086956522e-05, "loss": 0.5663, "step": 231 }, { "epoch": 0.21621621621621623, "grad_norm": 0.20452970266398846, "learning_rate": 1.4409937888198759e-05, "loss": 0.5899, "step": 232 }, { "epoch": 0.21714818266542404, "grad_norm": 0.18703783106852623, "learning_rate": 1.4472049689440994e-05, "loss": 0.5977, "step": 233 }, { "epoch": 0.21808014911463186, "grad_norm": 0.1888377962286148, "learning_rate": 1.4534161490683232e-05, "loss": 0.5656, "step": 234 }, { "epoch": 0.2190121155638397, "grad_norm": 0.1740886406538699, "learning_rate": 1.4596273291925467e-05, "loss": 0.5426, "step": 235 }, { "epoch": 0.21994408201304752, "grad_norm": 0.20690308289580275, "learning_rate": 1.4658385093167704e-05, "loss": 0.5781, "step": 236 }, { "epoch": 0.22087604846225536, "grad_norm": 0.17692637119143279, "learning_rate": 1.472049689440994e-05, "loss": 0.5724, "step": 237 }, { "epoch": 0.22180801491146318, "grad_norm": 0.18223871962948435, "learning_rate": 1.4782608695652174e-05, "loss": 0.5875, "step": 238 }, { "epoch": 0.22273998136067102, "grad_norm": 0.1740419525783155, "learning_rate": 1.4844720496894411e-05, "loss": 0.5358, "step": 239 }, { "epoch": 0.22367194780987884, "grad_norm": 0.20502175298422767, "learning_rate": 1.4906832298136646e-05, "loss": 0.5631, "step": 240 }, { "epoch": 0.22460391425908668, "grad_norm": 0.17874781813718815, "learning_rate": 1.4968944099378885e-05, "loss": 0.5787, "step": 241 }, { "epoch": 0.2255358807082945, "grad_norm": 0.17247816510925784, "learning_rate": 1.5031055900621118e-05, "loss": 0.5616, "step": 242 }, { "epoch": 0.22646784715750232, "grad_norm": 0.213222699074415, "learning_rate": 1.5093167701863356e-05, "loss": 0.5859, "step": 243 }, { "epoch": 0.22739981360671016, "grad_norm": 0.17529094935932302, "learning_rate": 1.5155279503105591e-05, "loss": 0.5577, "step": 244 }, { "epoch": 0.22833178005591798, "grad_norm": 0.17945016167030883, "learning_rate": 1.5217391304347828e-05, "loss": 0.5807, "step": 245 }, { "epoch": 0.22926374650512582, "grad_norm": 0.17064354730149425, "learning_rate": 1.5279503105590063e-05, "loss": 0.5458, "step": 246 }, { "epoch": 0.23019571295433364, "grad_norm": 0.16907826996133896, "learning_rate": 1.5341614906832298e-05, "loss": 0.5465, "step": 247 }, { "epoch": 0.23112767940354148, "grad_norm": 0.21986373983466817, "learning_rate": 1.5403726708074537e-05, "loss": 0.5971, "step": 248 }, { "epoch": 0.2320596458527493, "grad_norm": 0.1786023487363672, "learning_rate": 1.5465838509316772e-05, "loss": 0.5739, "step": 249 }, { "epoch": 0.23299161230195714, "grad_norm": 0.18505241743684242, "learning_rate": 1.5527950310559007e-05, "loss": 0.5582, "step": 250 }, { "epoch": 0.23392357875116496, "grad_norm": 0.20179970436943218, "learning_rate": 1.5590062111801242e-05, "loss": 0.597, "step": 251 }, { "epoch": 0.23485554520037277, "grad_norm": 0.19384786865466022, "learning_rate": 1.565217391304348e-05, "loss": 0.5516, "step": 252 }, { "epoch": 0.23578751164958062, "grad_norm": 0.22633202949704873, "learning_rate": 1.5714285714285715e-05, "loss": 0.5462, "step": 253 }, { "epoch": 0.23671947809878843, "grad_norm": 0.2330966451022343, "learning_rate": 1.5776397515527954e-05, "loss": 0.5972, "step": 254 }, { "epoch": 0.23765144454799628, "grad_norm": 0.22030606663631816, "learning_rate": 1.583850931677019e-05, "loss": 0.5815, "step": 255 }, { "epoch": 0.2385834109972041, "grad_norm": 0.1997832011036174, "learning_rate": 1.5900621118012424e-05, "loss": 0.5516, "step": 256 }, { "epoch": 0.23951537744641194, "grad_norm": 0.19019346863624276, "learning_rate": 1.596273291925466e-05, "loss": 0.5638, "step": 257 }, { "epoch": 0.24044734389561975, "grad_norm": 0.17723371842249236, "learning_rate": 1.6024844720496894e-05, "loss": 0.5622, "step": 258 }, { "epoch": 0.2413793103448276, "grad_norm": 0.18074222826344513, "learning_rate": 1.6086956521739132e-05, "loss": 0.5573, "step": 259 }, { "epoch": 0.2423112767940354, "grad_norm": 0.19566730963557588, "learning_rate": 1.6149068322981367e-05, "loss": 0.6065, "step": 260 }, { "epoch": 0.24324324324324326, "grad_norm": 0.19685338006110695, "learning_rate": 1.6211180124223606e-05, "loss": 0.5716, "step": 261 }, { "epoch": 0.24417520969245107, "grad_norm": 0.19889452281943024, "learning_rate": 1.627329192546584e-05, "loss": 0.6139, "step": 262 }, { "epoch": 0.2451071761416589, "grad_norm": 0.18760336791469165, "learning_rate": 1.6335403726708076e-05, "loss": 0.5637, "step": 263 }, { "epoch": 0.24603914259086673, "grad_norm": 0.18664285252205676, "learning_rate": 1.639751552795031e-05, "loss": 0.5175, "step": 264 }, { "epoch": 0.24697110904007455, "grad_norm": 0.19107889157683822, "learning_rate": 1.645962732919255e-05, "loss": 0.5467, "step": 265 }, { "epoch": 0.2479030754892824, "grad_norm": 0.18615186200451214, "learning_rate": 1.6521739130434785e-05, "loss": 0.5713, "step": 266 }, { "epoch": 0.2488350419384902, "grad_norm": 0.27767619429221985, "learning_rate": 1.658385093167702e-05, "loss": 0.5763, "step": 267 }, { "epoch": 0.24976700838769805, "grad_norm": 0.3000872401308293, "learning_rate": 1.6645962732919258e-05, "loss": 0.5761, "step": 268 }, { "epoch": 0.2506989748369059, "grad_norm": 0.40479384996759615, "learning_rate": 1.670807453416149e-05, "loss": 0.5556, "step": 269 }, { "epoch": 0.2516309412861137, "grad_norm": 0.1898229088675611, "learning_rate": 1.6770186335403728e-05, "loss": 0.5416, "step": 270 }, { "epoch": 0.25256290773532153, "grad_norm": 0.21659362481061462, "learning_rate": 1.6832298136645963e-05, "loss": 0.5221, "step": 271 }, { "epoch": 0.2534948741845294, "grad_norm": 0.23755979436399893, "learning_rate": 1.68944099378882e-05, "loss": 0.5883, "step": 272 }, { "epoch": 0.25442684063373716, "grad_norm": 0.22367948629096315, "learning_rate": 1.6956521739130437e-05, "loss": 0.5816, "step": 273 }, { "epoch": 0.255358807082945, "grad_norm": 0.20488939489292832, "learning_rate": 1.7018633540372672e-05, "loss": 0.5689, "step": 274 }, { "epoch": 0.25629077353215285, "grad_norm": 0.1862888526425489, "learning_rate": 1.7080745341614907e-05, "loss": 0.5805, "step": 275 }, { "epoch": 0.2572227399813607, "grad_norm": 0.20879047203495332, "learning_rate": 1.7142857142857142e-05, "loss": 0.6036, "step": 276 }, { "epoch": 0.2581547064305685, "grad_norm": 0.22750652052087278, "learning_rate": 1.720496894409938e-05, "loss": 0.5395, "step": 277 }, { "epoch": 0.2590866728797763, "grad_norm": 0.1878038213140467, "learning_rate": 1.7267080745341615e-05, "loss": 0.5437, "step": 278 }, { "epoch": 0.26001863932898417, "grad_norm": 0.19121389535550024, "learning_rate": 1.7329192546583854e-05, "loss": 0.5647, "step": 279 }, { "epoch": 0.26095060577819196, "grad_norm": 0.1961930549660334, "learning_rate": 1.739130434782609e-05, "loss": 0.5383, "step": 280 }, { "epoch": 0.2618825722273998, "grad_norm": 0.19535153622441037, "learning_rate": 1.7453416149068324e-05, "loss": 0.5709, "step": 281 }, { "epoch": 0.26281453867660765, "grad_norm": 0.21829614592957994, "learning_rate": 1.751552795031056e-05, "loss": 0.5961, "step": 282 }, { "epoch": 0.2637465051258155, "grad_norm": 0.20244799441016628, "learning_rate": 1.7577639751552797e-05, "loss": 0.5481, "step": 283 }, { "epoch": 0.2646784715750233, "grad_norm": 0.19218144689318692, "learning_rate": 1.7639751552795032e-05, "loss": 0.5447, "step": 284 }, { "epoch": 0.2656104380242311, "grad_norm": 0.19302837948413193, "learning_rate": 1.7701863354037267e-05, "loss": 0.5512, "step": 285 }, { "epoch": 0.26654240447343897, "grad_norm": 0.1938969465676264, "learning_rate": 1.7763975155279506e-05, "loss": 0.5777, "step": 286 }, { "epoch": 0.2674743709226468, "grad_norm": 0.17875685691475898, "learning_rate": 1.782608695652174e-05, "loss": 0.5495, "step": 287 }, { "epoch": 0.2684063373718546, "grad_norm": 0.17538799705662306, "learning_rate": 1.7888198757763976e-05, "loss": 0.5633, "step": 288 }, { "epoch": 0.26933830382106244, "grad_norm": 0.17781212892646506, "learning_rate": 1.795031055900621e-05, "loss": 0.5779, "step": 289 }, { "epoch": 0.2702702702702703, "grad_norm": 0.17027543067135784, "learning_rate": 1.801242236024845e-05, "loss": 0.5502, "step": 290 }, { "epoch": 0.2712022367194781, "grad_norm": 0.19409874980702269, "learning_rate": 1.8074534161490685e-05, "loss": 0.5678, "step": 291 }, { "epoch": 0.2721342031686859, "grad_norm": 0.17867512900051108, "learning_rate": 1.8136645962732923e-05, "loss": 0.5619, "step": 292 }, { "epoch": 0.27306616961789376, "grad_norm": 0.1822728760155399, "learning_rate": 1.8198757763975158e-05, "loss": 0.5514, "step": 293 }, { "epoch": 0.2739981360671016, "grad_norm": 0.20890498232176602, "learning_rate": 1.8260869565217393e-05, "loss": 0.5691, "step": 294 }, { "epoch": 0.2749301025163094, "grad_norm": 0.19159709246760698, "learning_rate": 1.8322981366459628e-05, "loss": 0.5875, "step": 295 }, { "epoch": 0.27586206896551724, "grad_norm": 0.17673449602485752, "learning_rate": 1.8385093167701863e-05, "loss": 0.5379, "step": 296 }, { "epoch": 0.2767940354147251, "grad_norm": 0.17967157437127868, "learning_rate": 1.84472049689441e-05, "loss": 0.5588, "step": 297 }, { "epoch": 0.2777260018639329, "grad_norm": 0.1756914407430521, "learning_rate": 1.8509316770186337e-05, "loss": 0.5815, "step": 298 }, { "epoch": 0.2786579683131407, "grad_norm": 0.19406665631971665, "learning_rate": 1.8571428571428575e-05, "loss": 0.5531, "step": 299 }, { "epoch": 0.27958993476234856, "grad_norm": 0.5080676372225253, "learning_rate": 1.863354037267081e-05, "loss": 0.609, "step": 300 }, { "epoch": 0.2805219012115564, "grad_norm": 0.23200098941098327, "learning_rate": 1.8695652173913045e-05, "loss": 0.5567, "step": 301 }, { "epoch": 0.2814538676607642, "grad_norm": 0.17468838027479205, "learning_rate": 1.875776397515528e-05, "loss": 0.5513, "step": 302 }, { "epoch": 0.28238583410997203, "grad_norm": 0.1686001736996894, "learning_rate": 1.881987577639752e-05, "loss": 0.5353, "step": 303 }, { "epoch": 0.2833178005591799, "grad_norm": 0.18015841561422136, "learning_rate": 1.8881987577639754e-05, "loss": 0.5752, "step": 304 }, { "epoch": 0.2842497670083877, "grad_norm": 0.1759490332338303, "learning_rate": 1.894409937888199e-05, "loss": 0.5569, "step": 305 }, { "epoch": 0.2851817334575955, "grad_norm": 0.17803079437851896, "learning_rate": 1.9006211180124224e-05, "loss": 0.5282, "step": 306 }, { "epoch": 0.28611369990680335, "grad_norm": 0.17355927181609834, "learning_rate": 1.906832298136646e-05, "loss": 0.5386, "step": 307 }, { "epoch": 0.2870456663560112, "grad_norm": 0.1662122617531256, "learning_rate": 1.9130434782608697e-05, "loss": 0.5398, "step": 308 }, { "epoch": 0.287977632805219, "grad_norm": 0.18154101501139733, "learning_rate": 1.9192546583850932e-05, "loss": 0.5412, "step": 309 }, { "epoch": 0.28890959925442683, "grad_norm": 0.18694218775074156, "learning_rate": 1.925465838509317e-05, "loss": 0.5735, "step": 310 }, { "epoch": 0.2898415657036347, "grad_norm": 0.21918083844708672, "learning_rate": 1.9316770186335406e-05, "loss": 0.5783, "step": 311 }, { "epoch": 0.2907735321528425, "grad_norm": 0.17461180179368901, "learning_rate": 1.937888198757764e-05, "loss": 0.5449, "step": 312 }, { "epoch": 0.2917054986020503, "grad_norm": 0.20735684031826418, "learning_rate": 1.9440993788819876e-05, "loss": 0.5651, "step": 313 }, { "epoch": 0.29263746505125815, "grad_norm": 0.16548440909119969, "learning_rate": 1.950310559006211e-05, "loss": 0.5264, "step": 314 }, { "epoch": 0.293569431500466, "grad_norm": 0.5353573826075055, "learning_rate": 1.956521739130435e-05, "loss": 0.5751, "step": 315 }, { "epoch": 0.29450139794967384, "grad_norm": 0.17402872459404528, "learning_rate": 1.9627329192546585e-05, "loss": 0.5188, "step": 316 }, { "epoch": 0.2954333643988816, "grad_norm": 0.18655914656323475, "learning_rate": 1.9689440993788823e-05, "loss": 0.5267, "step": 317 }, { "epoch": 0.29636533084808947, "grad_norm": 0.22569558415766544, "learning_rate": 1.9751552795031058e-05, "loss": 0.5478, "step": 318 }, { "epoch": 0.2972972972972973, "grad_norm": 0.19208033279173237, "learning_rate": 1.9813664596273293e-05, "loss": 0.5319, "step": 319 }, { "epoch": 0.2982292637465051, "grad_norm": 0.20503464606593874, "learning_rate": 1.9875776397515528e-05, "loss": 0.5588, "step": 320 }, { "epoch": 0.29916123019571295, "grad_norm": 0.20302921136635202, "learning_rate": 1.9937888198757767e-05, "loss": 0.523, "step": 321 }, { "epoch": 0.3000931966449208, "grad_norm": 0.17397345035979556, "learning_rate": 2e-05, "loss": 0.5389, "step": 322 }, { "epoch": 0.30102516309412863, "grad_norm": 0.16995528793309664, "learning_rate": 1.999309630652399e-05, "loss": 0.536, "step": 323 }, { "epoch": 0.3019571295433364, "grad_norm": 0.17365243687401985, "learning_rate": 1.9986192613047983e-05, "loss": 0.5364, "step": 324 }, { "epoch": 0.30288909599254427, "grad_norm": 0.20169501205281173, "learning_rate": 1.9979288919571972e-05, "loss": 0.534, "step": 325 }, { "epoch": 0.3038210624417521, "grad_norm": 0.1737214008361505, "learning_rate": 1.997238522609596e-05, "loss": 0.5377, "step": 326 }, { "epoch": 0.3047530288909599, "grad_norm": 0.18729688030697347, "learning_rate": 1.9965481532619954e-05, "loss": 0.5479, "step": 327 }, { "epoch": 0.30568499534016774, "grad_norm": 0.20204799642324028, "learning_rate": 1.9958577839143946e-05, "loss": 0.5388, "step": 328 }, { "epoch": 0.3066169617893756, "grad_norm": 0.18534869726657918, "learning_rate": 1.9951674145667935e-05, "loss": 0.5695, "step": 329 }, { "epoch": 0.30754892823858343, "grad_norm": 0.20500372910212514, "learning_rate": 1.9944770452191924e-05, "loss": 0.5394, "step": 330 }, { "epoch": 0.3084808946877912, "grad_norm": 0.19779593506051052, "learning_rate": 1.9937866758715913e-05, "loss": 0.5391, "step": 331 }, { "epoch": 0.30941286113699906, "grad_norm": 0.19197565902905606, "learning_rate": 1.9930963065239906e-05, "loss": 0.5399, "step": 332 }, { "epoch": 0.3103448275862069, "grad_norm": 0.20089281402992662, "learning_rate": 1.9924059371763895e-05, "loss": 0.5422, "step": 333 }, { "epoch": 0.31127679403541475, "grad_norm": 0.19833688225571003, "learning_rate": 1.9917155678287884e-05, "loss": 0.551, "step": 334 }, { "epoch": 0.31220876048462254, "grad_norm": 0.16764362086305679, "learning_rate": 1.9910251984811876e-05, "loss": 0.5422, "step": 335 }, { "epoch": 0.3131407269338304, "grad_norm": 0.22057561019510574, "learning_rate": 1.9903348291335865e-05, "loss": 0.575, "step": 336 }, { "epoch": 0.3140726933830382, "grad_norm": 0.17850860412387784, "learning_rate": 1.9896444597859858e-05, "loss": 0.5392, "step": 337 }, { "epoch": 0.315004659832246, "grad_norm": 0.1722284713997011, "learning_rate": 1.9889540904383847e-05, "loss": 0.5483, "step": 338 }, { "epoch": 0.31593662628145386, "grad_norm": 0.17193306229499267, "learning_rate": 1.988263721090784e-05, "loss": 0.5215, "step": 339 }, { "epoch": 0.3168685927306617, "grad_norm": 0.1682648553388606, "learning_rate": 1.9875733517431828e-05, "loss": 0.5513, "step": 340 }, { "epoch": 0.31780055917986955, "grad_norm": 0.1755646315218664, "learning_rate": 1.9868829823955817e-05, "loss": 0.5387, "step": 341 }, { "epoch": 0.31873252562907733, "grad_norm": 0.18998416253201317, "learning_rate": 1.9861926130479806e-05, "loss": 0.562, "step": 342 }, { "epoch": 0.3196644920782852, "grad_norm": 0.16570550917430776, "learning_rate": 1.98550224370038e-05, "loss": 0.5278, "step": 343 }, { "epoch": 0.320596458527493, "grad_norm": 0.185409466422482, "learning_rate": 1.9848118743527788e-05, "loss": 0.5637, "step": 344 }, { "epoch": 0.32152842497670087, "grad_norm": 0.3120500931751614, "learning_rate": 1.984121505005178e-05, "loss": 0.5105, "step": 345 }, { "epoch": 0.32246039142590865, "grad_norm": 0.18567282672812946, "learning_rate": 1.983431135657577e-05, "loss": 0.5458, "step": 346 }, { "epoch": 0.3233923578751165, "grad_norm": 0.1926172026543041, "learning_rate": 1.982740766309976e-05, "loss": 0.5887, "step": 347 }, { "epoch": 0.32432432432432434, "grad_norm": 0.17865645702029637, "learning_rate": 1.982050396962375e-05, "loss": 0.557, "step": 348 }, { "epoch": 0.32525629077353213, "grad_norm": 0.21680790949964387, "learning_rate": 1.981360027614774e-05, "loss": 0.5487, "step": 349 }, { "epoch": 0.32618825722274, "grad_norm": 0.1721110070025601, "learning_rate": 1.9806696582671732e-05, "loss": 0.5403, "step": 350 }, { "epoch": 0.3271202236719478, "grad_norm": 0.20251756491198586, "learning_rate": 1.979979288919572e-05, "loss": 0.5459, "step": 351 }, { "epoch": 0.32805219012115566, "grad_norm": 0.17116468193576467, "learning_rate": 1.979288919571971e-05, "loss": 0.5381, "step": 352 }, { "epoch": 0.32898415657036345, "grad_norm": 0.17953646530055048, "learning_rate": 1.9785985502243702e-05, "loss": 0.5263, "step": 353 }, { "epoch": 0.3299161230195713, "grad_norm": 0.18010931291898608, "learning_rate": 1.977908180876769e-05, "loss": 0.5325, "step": 354 }, { "epoch": 0.33084808946877914, "grad_norm": 0.18411057609288908, "learning_rate": 1.9772178115291684e-05, "loss": 0.5583, "step": 355 }, { "epoch": 0.3317800559179869, "grad_norm": 0.1991712866968987, "learning_rate": 1.9765274421815673e-05, "loss": 0.5089, "step": 356 }, { "epoch": 0.33271202236719477, "grad_norm": 0.18128012629472473, "learning_rate": 1.9758370728339665e-05, "loss": 0.5406, "step": 357 }, { "epoch": 0.3336439888164026, "grad_norm": 0.20302614599058091, "learning_rate": 1.9751467034863654e-05, "loss": 0.5466, "step": 358 }, { "epoch": 0.33457595526561046, "grad_norm": 0.18455549816471245, "learning_rate": 1.9744563341387643e-05, "loss": 0.5681, "step": 359 }, { "epoch": 0.33550792171481825, "grad_norm": 0.1985949802350472, "learning_rate": 1.9737659647911633e-05, "loss": 0.5169, "step": 360 }, { "epoch": 0.3364398881640261, "grad_norm": 0.1709091536571559, "learning_rate": 1.9730755954435625e-05, "loss": 0.5201, "step": 361 }, { "epoch": 0.33737185461323393, "grad_norm": 0.18568990737941676, "learning_rate": 1.9723852260959614e-05, "loss": 0.5194, "step": 362 }, { "epoch": 0.3383038210624418, "grad_norm": 0.2018126650328682, "learning_rate": 1.9716948567483606e-05, "loss": 0.5343, "step": 363 }, { "epoch": 0.33923578751164957, "grad_norm": 0.18427066397749298, "learning_rate": 1.9710044874007595e-05, "loss": 0.5646, "step": 364 }, { "epoch": 0.3401677539608574, "grad_norm": 0.19132569538111646, "learning_rate": 1.9703141180531588e-05, "loss": 0.5245, "step": 365 }, { "epoch": 0.34109972041006525, "grad_norm": 0.18554283600675056, "learning_rate": 1.9696237487055577e-05, "loss": 0.5359, "step": 366 }, { "epoch": 0.34203168685927304, "grad_norm": 0.2124622584310907, "learning_rate": 1.9689333793579566e-05, "loss": 0.5919, "step": 367 }, { "epoch": 0.3429636533084809, "grad_norm": 0.17084155808665397, "learning_rate": 1.968243010010356e-05, "loss": 0.5401, "step": 368 }, { "epoch": 0.34389561975768873, "grad_norm": 0.17672196553254726, "learning_rate": 1.9675526406627547e-05, "loss": 0.515, "step": 369 }, { "epoch": 0.3448275862068966, "grad_norm": 0.17636359608836233, "learning_rate": 1.9668622713151536e-05, "loss": 0.551, "step": 370 }, { "epoch": 0.34575955265610436, "grad_norm": 0.17674791890453187, "learning_rate": 1.9661719019675526e-05, "loss": 0.5225, "step": 371 }, { "epoch": 0.3466915191053122, "grad_norm": 0.18045827448487498, "learning_rate": 1.9654815326199518e-05, "loss": 0.5253, "step": 372 }, { "epoch": 0.34762348555452005, "grad_norm": 0.18055972402330703, "learning_rate": 1.9647911632723507e-05, "loss": 0.5388, "step": 373 }, { "epoch": 0.34855545200372784, "grad_norm": 0.1708979659452527, "learning_rate": 1.96410079392475e-05, "loss": 0.5075, "step": 374 }, { "epoch": 0.3494874184529357, "grad_norm": 0.20428443678358058, "learning_rate": 1.9634104245771492e-05, "loss": 0.5546, "step": 375 }, { "epoch": 0.3504193849021435, "grad_norm": 0.1750279449584709, "learning_rate": 1.962720055229548e-05, "loss": 0.538, "step": 376 }, { "epoch": 0.35135135135135137, "grad_norm": 0.2672888780894872, "learning_rate": 1.962029685881947e-05, "loss": 0.574, "step": 377 }, { "epoch": 0.35228331780055916, "grad_norm": 0.17164089417701317, "learning_rate": 1.961339316534346e-05, "loss": 0.5323, "step": 378 }, { "epoch": 0.353215284249767, "grad_norm": 0.21724459303628876, "learning_rate": 1.960648947186745e-05, "loss": 0.5577, "step": 379 }, { "epoch": 0.35414725069897485, "grad_norm": 0.20240059987890285, "learning_rate": 1.959958577839144e-05, "loss": 0.54, "step": 380 }, { "epoch": 0.3550792171481827, "grad_norm": 0.17138806271153204, "learning_rate": 1.959268208491543e-05, "loss": 0.5441, "step": 381 }, { "epoch": 0.3560111835973905, "grad_norm": 0.16613056365504175, "learning_rate": 1.9585778391439422e-05, "loss": 0.5302, "step": 382 }, { "epoch": 0.3569431500465983, "grad_norm": 0.25097862962497985, "learning_rate": 1.957887469796341e-05, "loss": 0.5761, "step": 383 }, { "epoch": 0.35787511649580617, "grad_norm": 0.17910920131613506, "learning_rate": 1.9571971004487403e-05, "loss": 0.5577, "step": 384 }, { "epoch": 0.35880708294501396, "grad_norm": 0.17999546076581407, "learning_rate": 1.9565067311011392e-05, "loss": 0.5431, "step": 385 }, { "epoch": 0.3597390493942218, "grad_norm": 0.17342204736908776, "learning_rate": 1.9558163617535385e-05, "loss": 0.5244, "step": 386 }, { "epoch": 0.36067101584342964, "grad_norm": 0.2070957879182263, "learning_rate": 1.9551259924059374e-05, "loss": 0.515, "step": 387 }, { "epoch": 0.3616029822926375, "grad_norm": 0.19995711997474253, "learning_rate": 1.9544356230583363e-05, "loss": 0.539, "step": 388 }, { "epoch": 0.3625349487418453, "grad_norm": 0.1759068478095124, "learning_rate": 1.9537452537107352e-05, "loss": 0.5456, "step": 389 }, { "epoch": 0.3634669151910531, "grad_norm": 0.18692661657170506, "learning_rate": 1.9530548843631344e-05, "loss": 0.5389, "step": 390 }, { "epoch": 0.36439888164026096, "grad_norm": 0.17479864431764314, "learning_rate": 1.9523645150155333e-05, "loss": 0.5706, "step": 391 }, { "epoch": 0.36533084808946875, "grad_norm": 0.18268653315762792, "learning_rate": 1.9516741456679326e-05, "loss": 0.4916, "step": 392 }, { "epoch": 0.3662628145386766, "grad_norm": 0.22406805243120256, "learning_rate": 1.9509837763203315e-05, "loss": 0.5254, "step": 393 }, { "epoch": 0.36719478098788444, "grad_norm": 0.17687534338759894, "learning_rate": 1.9502934069727307e-05, "loss": 0.5471, "step": 394 }, { "epoch": 0.3681267474370923, "grad_norm": 0.18803496121088278, "learning_rate": 1.9496030376251296e-05, "loss": 0.5616, "step": 395 }, { "epoch": 0.36905871388630007, "grad_norm": 0.17429696640434417, "learning_rate": 1.9489126682775285e-05, "loss": 0.5221, "step": 396 }, { "epoch": 0.3699906803355079, "grad_norm": 0.21364482206135685, "learning_rate": 1.9482222989299278e-05, "loss": 0.5019, "step": 397 }, { "epoch": 0.37092264678471576, "grad_norm": 0.18399365940130605, "learning_rate": 1.9475319295823267e-05, "loss": 0.5481, "step": 398 }, { "epoch": 0.3718546132339236, "grad_norm": 0.1702696133011687, "learning_rate": 1.9468415602347256e-05, "loss": 0.5233, "step": 399 }, { "epoch": 0.3727865796831314, "grad_norm": 0.1974653328070696, "learning_rate": 1.9461511908871248e-05, "loss": 0.5127, "step": 400 }, { "epoch": 0.37371854613233924, "grad_norm": 0.1822993036459006, "learning_rate": 1.9454608215395237e-05, "loss": 0.5354, "step": 401 }, { "epoch": 0.3746505125815471, "grad_norm": 0.2079809256548964, "learning_rate": 1.944770452191923e-05, "loss": 0.5679, "step": 402 }, { "epoch": 0.37558247903075487, "grad_norm": 0.2333432435475023, "learning_rate": 1.944080082844322e-05, "loss": 0.5305, "step": 403 }, { "epoch": 0.3765144454799627, "grad_norm": 0.20011472448275555, "learning_rate": 1.943389713496721e-05, "loss": 0.5449, "step": 404 }, { "epoch": 0.37744641192917056, "grad_norm": 0.18252451440475512, "learning_rate": 1.94269934414912e-05, "loss": 0.5318, "step": 405 }, { "epoch": 0.3783783783783784, "grad_norm": 0.18970712639549925, "learning_rate": 1.942008974801519e-05, "loss": 0.5331, "step": 406 }, { "epoch": 0.3793103448275862, "grad_norm": 0.3054054072741445, "learning_rate": 1.941318605453918e-05, "loss": 0.5377, "step": 407 }, { "epoch": 0.38024231127679403, "grad_norm": 0.1679743484516824, "learning_rate": 1.940628236106317e-05, "loss": 0.5347, "step": 408 }, { "epoch": 0.3811742777260019, "grad_norm": 0.19335889379040402, "learning_rate": 1.939937866758716e-05, "loss": 0.5419, "step": 409 }, { "epoch": 0.3821062441752097, "grad_norm": 0.1736680055170217, "learning_rate": 1.9392474974111152e-05, "loss": 0.5394, "step": 410 }, { "epoch": 0.3830382106244175, "grad_norm": 0.20238747773191848, "learning_rate": 1.938557128063514e-05, "loss": 0.5378, "step": 411 }, { "epoch": 0.38397017707362535, "grad_norm": 0.1897693982388872, "learning_rate": 1.9378667587159134e-05, "loss": 0.5595, "step": 412 }, { "epoch": 0.3849021435228332, "grad_norm": 0.16675953346974584, "learning_rate": 1.9371763893683123e-05, "loss": 0.4976, "step": 413 }, { "epoch": 0.385834109972041, "grad_norm": 0.1793449562479549, "learning_rate": 1.9364860200207112e-05, "loss": 0.4799, "step": 414 }, { "epoch": 0.38676607642124883, "grad_norm": 0.16670846809642864, "learning_rate": 1.9357956506731104e-05, "loss": 0.5353, "step": 415 }, { "epoch": 0.38769804287045667, "grad_norm": 0.19520185157701506, "learning_rate": 1.9351052813255093e-05, "loss": 0.5808, "step": 416 }, { "epoch": 0.3886300093196645, "grad_norm": 0.18708830545494362, "learning_rate": 1.9344149119779082e-05, "loss": 0.5449, "step": 417 }, { "epoch": 0.3895619757688723, "grad_norm": 0.1965036932502825, "learning_rate": 1.933724542630307e-05, "loss": 0.5304, "step": 418 }, { "epoch": 0.39049394221808015, "grad_norm": 0.21557910799763708, "learning_rate": 1.9330341732827064e-05, "loss": 0.556, "step": 419 }, { "epoch": 0.391425908667288, "grad_norm": 0.1720737760335889, "learning_rate": 1.9323438039351056e-05, "loss": 0.5107, "step": 420 }, { "epoch": 0.3923578751164958, "grad_norm": 0.18832022391902087, "learning_rate": 1.9316534345875045e-05, "loss": 0.5381, "step": 421 }, { "epoch": 0.3932898415657036, "grad_norm": 0.17903901154372062, "learning_rate": 1.9309630652399034e-05, "loss": 0.5405, "step": 422 }, { "epoch": 0.39422180801491147, "grad_norm": 0.1948176782258854, "learning_rate": 1.9302726958923027e-05, "loss": 0.5721, "step": 423 }, { "epoch": 0.3951537744641193, "grad_norm": 0.2011493735251545, "learning_rate": 1.9295823265447016e-05, "loss": 0.5263, "step": 424 }, { "epoch": 0.3960857409133271, "grad_norm": 0.17165170920006226, "learning_rate": 1.9288919571971005e-05, "loss": 0.5291, "step": 425 }, { "epoch": 0.39701770736253494, "grad_norm": 0.19233000240296147, "learning_rate": 1.9282015878494997e-05, "loss": 0.5548, "step": 426 }, { "epoch": 0.3979496738117428, "grad_norm": 0.19712846354048344, "learning_rate": 1.9275112185018986e-05, "loss": 0.5321, "step": 427 }, { "epoch": 0.39888164026095063, "grad_norm": 0.16898427161283544, "learning_rate": 1.9268208491542975e-05, "loss": 0.5064, "step": 428 }, { "epoch": 0.3998136067101584, "grad_norm": 0.1701187933344972, "learning_rate": 1.9261304798066968e-05, "loss": 0.5023, "step": 429 }, { "epoch": 0.40074557315936626, "grad_norm": 0.17506210820281926, "learning_rate": 1.925440110459096e-05, "loss": 0.5355, "step": 430 }, { "epoch": 0.4016775396085741, "grad_norm": 0.17719840849951285, "learning_rate": 1.924749741111495e-05, "loss": 0.5391, "step": 431 }, { "epoch": 0.4026095060577819, "grad_norm": 0.16846345207406374, "learning_rate": 1.9240593717638938e-05, "loss": 0.5478, "step": 432 }, { "epoch": 0.40354147250698974, "grad_norm": 0.1758887819775569, "learning_rate": 1.9233690024162927e-05, "loss": 0.5298, "step": 433 }, { "epoch": 0.4044734389561976, "grad_norm": 0.16607771369271107, "learning_rate": 1.922678633068692e-05, "loss": 0.5444, "step": 434 }, { "epoch": 0.40540540540540543, "grad_norm": 0.18276808135897016, "learning_rate": 1.921988263721091e-05, "loss": 0.5377, "step": 435 }, { "epoch": 0.4063373718546132, "grad_norm": 0.1766149831792003, "learning_rate": 1.9212978943734898e-05, "loss": 0.5477, "step": 436 }, { "epoch": 0.40726933830382106, "grad_norm": 0.1797451893638745, "learning_rate": 1.920607525025889e-05, "loss": 0.5473, "step": 437 }, { "epoch": 0.4082013047530289, "grad_norm": 0.1719904741935769, "learning_rate": 1.919917155678288e-05, "loss": 0.5278, "step": 438 }, { "epoch": 0.4091332712022367, "grad_norm": 0.17631553557973575, "learning_rate": 1.919226786330687e-05, "loss": 0.5207, "step": 439 }, { "epoch": 0.41006523765144454, "grad_norm": 0.19473555723419755, "learning_rate": 1.918536416983086e-05, "loss": 0.5314, "step": 440 }, { "epoch": 0.4109972041006524, "grad_norm": 0.189694713978434, "learning_rate": 1.9178460476354853e-05, "loss": 0.5298, "step": 441 }, { "epoch": 0.4119291705498602, "grad_norm": 0.1949900274794338, "learning_rate": 1.9171556782878842e-05, "loss": 0.5373, "step": 442 }, { "epoch": 0.412861136999068, "grad_norm": 0.17503674655942839, "learning_rate": 1.916465308940283e-05, "loss": 0.5198, "step": 443 }, { "epoch": 0.41379310344827586, "grad_norm": 0.18433584086883067, "learning_rate": 1.9157749395926824e-05, "loss": 0.5093, "step": 444 }, { "epoch": 0.4147250698974837, "grad_norm": 0.2223873872037597, "learning_rate": 1.9150845702450813e-05, "loss": 0.5488, "step": 445 }, { "epoch": 0.41565703634669154, "grad_norm": 0.202042411514076, "learning_rate": 1.91439420089748e-05, "loss": 0.5325, "step": 446 }, { "epoch": 0.41658900279589933, "grad_norm": 0.17066811143551908, "learning_rate": 1.9137038315498794e-05, "loss": 0.5113, "step": 447 }, { "epoch": 0.4175209692451072, "grad_norm": 0.16939039559281477, "learning_rate": 1.9130134622022783e-05, "loss": 0.5139, "step": 448 }, { "epoch": 0.418452935694315, "grad_norm": 0.18708313956732062, "learning_rate": 1.9123230928546776e-05, "loss": 0.5275, "step": 449 }, { "epoch": 0.4193849021435228, "grad_norm": 0.20329568332628087, "learning_rate": 1.9116327235070765e-05, "loss": 0.5048, "step": 450 }, { "epoch": 0.42031686859273065, "grad_norm": 0.192953928510773, "learning_rate": 1.9109423541594754e-05, "loss": 0.521, "step": 451 }, { "epoch": 0.4212488350419385, "grad_norm": 0.17734035613927393, "learning_rate": 1.9102519848118746e-05, "loss": 0.5302, "step": 452 }, { "epoch": 0.42218080149114634, "grad_norm": 0.17485031163488354, "learning_rate": 1.9095616154642735e-05, "loss": 0.5322, "step": 453 }, { "epoch": 0.42311276794035413, "grad_norm": 0.18239588354046976, "learning_rate": 1.9088712461166724e-05, "loss": 0.528, "step": 454 }, { "epoch": 0.424044734389562, "grad_norm": 0.16894411645465665, "learning_rate": 1.9081808767690717e-05, "loss": 0.5107, "step": 455 }, { "epoch": 0.4249767008387698, "grad_norm": 0.16217186269758022, "learning_rate": 1.9074905074214706e-05, "loss": 0.4936, "step": 456 }, { "epoch": 0.42590866728797766, "grad_norm": 0.2239799034885214, "learning_rate": 1.9068001380738698e-05, "loss": 0.5302, "step": 457 }, { "epoch": 0.42684063373718545, "grad_norm": 0.19409843602039029, "learning_rate": 1.9061097687262687e-05, "loss": 0.5323, "step": 458 }, { "epoch": 0.4277726001863933, "grad_norm": 0.1844092990801573, "learning_rate": 1.905419399378668e-05, "loss": 0.5123, "step": 459 }, { "epoch": 0.42870456663560114, "grad_norm": 0.18485258310857944, "learning_rate": 1.904729030031067e-05, "loss": 0.5335, "step": 460 }, { "epoch": 0.4296365330848089, "grad_norm": 0.22495025764153986, "learning_rate": 1.9040386606834658e-05, "loss": 0.5218, "step": 461 }, { "epoch": 0.43056849953401677, "grad_norm": 0.1828617036717348, "learning_rate": 1.9033482913358647e-05, "loss": 0.4971, "step": 462 }, { "epoch": 0.4315004659832246, "grad_norm": 0.19080064600071525, "learning_rate": 1.902657921988264e-05, "loss": 0.5095, "step": 463 }, { "epoch": 0.43243243243243246, "grad_norm": 0.1654312884438549, "learning_rate": 1.9019675526406628e-05, "loss": 0.4933, "step": 464 }, { "epoch": 0.43336439888164024, "grad_norm": 0.212142567617406, "learning_rate": 1.901277183293062e-05, "loss": 0.5661, "step": 465 }, { "epoch": 0.4342963653308481, "grad_norm": 0.18209754996023134, "learning_rate": 1.900586813945461e-05, "loss": 0.51, "step": 466 }, { "epoch": 0.43522833178005593, "grad_norm": 0.16852181798990704, "learning_rate": 1.8998964445978602e-05, "loss": 0.5616, "step": 467 }, { "epoch": 0.4361602982292637, "grad_norm": 0.181692516439994, "learning_rate": 1.899206075250259e-05, "loss": 0.5409, "step": 468 }, { "epoch": 0.43709226467847156, "grad_norm": 0.18670124641128263, "learning_rate": 1.898515705902658e-05, "loss": 0.5338, "step": 469 }, { "epoch": 0.4380242311276794, "grad_norm": 0.16229624297367554, "learning_rate": 1.8978253365550572e-05, "loss": 0.5196, "step": 470 }, { "epoch": 0.43895619757688725, "grad_norm": 0.1921284750664304, "learning_rate": 1.897134967207456e-05, "loss": 0.5072, "step": 471 }, { "epoch": 0.43988816402609504, "grad_norm": 0.22054938514557543, "learning_rate": 1.896444597859855e-05, "loss": 0.5542, "step": 472 }, { "epoch": 0.4408201304753029, "grad_norm": 0.18030136462122032, "learning_rate": 1.895754228512254e-05, "loss": 0.5315, "step": 473 }, { "epoch": 0.44175209692451073, "grad_norm": 0.16673129956413477, "learning_rate": 1.8950638591646532e-05, "loss": 0.4989, "step": 474 }, { "epoch": 0.4426840633737186, "grad_norm": 0.2209840532430861, "learning_rate": 1.894373489817052e-05, "loss": 0.5126, "step": 475 }, { "epoch": 0.44361602982292636, "grad_norm": 0.22955392702466187, "learning_rate": 1.8936831204694513e-05, "loss": 0.5593, "step": 476 }, { "epoch": 0.4445479962721342, "grad_norm": 0.18335160355333843, "learning_rate": 1.8929927511218506e-05, "loss": 0.5212, "step": 477 }, { "epoch": 0.44547996272134205, "grad_norm": 0.18290166379604136, "learning_rate": 1.8923023817742495e-05, "loss": 0.5463, "step": 478 }, { "epoch": 0.44641192917054984, "grad_norm": 0.2239948673084491, "learning_rate": 1.8916120124266484e-05, "loss": 0.5154, "step": 479 }, { "epoch": 0.4473438956197577, "grad_norm": 0.18317548734399, "learning_rate": 1.8909216430790473e-05, "loss": 0.5331, "step": 480 }, { "epoch": 0.4482758620689655, "grad_norm": 0.19337714644207532, "learning_rate": 1.8902312737314465e-05, "loss": 0.5432, "step": 481 }, { "epoch": 0.44920782851817337, "grad_norm": 0.21891478316066892, "learning_rate": 1.8895409043838454e-05, "loss": 0.5407, "step": 482 }, { "epoch": 0.45013979496738116, "grad_norm": 0.1773639173039466, "learning_rate": 1.8888505350362443e-05, "loss": 0.5245, "step": 483 }, { "epoch": 0.451071761416589, "grad_norm": 0.16671829146349765, "learning_rate": 1.8881601656886436e-05, "loss": 0.4939, "step": 484 }, { "epoch": 0.45200372786579684, "grad_norm": 0.20835824098635924, "learning_rate": 1.8874697963410425e-05, "loss": 0.5245, "step": 485 }, { "epoch": 0.45293569431500463, "grad_norm": 0.1963488033292353, "learning_rate": 1.8867794269934417e-05, "loss": 0.552, "step": 486 }, { "epoch": 0.4538676607642125, "grad_norm": 0.18410589333047908, "learning_rate": 1.8860890576458406e-05, "loss": 0.5161, "step": 487 }, { "epoch": 0.4547996272134203, "grad_norm": 0.1793341958363089, "learning_rate": 1.88539868829824e-05, "loss": 0.5242, "step": 488 }, { "epoch": 0.45573159366262816, "grad_norm": 0.19800442828698867, "learning_rate": 1.8847083189506388e-05, "loss": 0.516, "step": 489 }, { "epoch": 0.45666356011183595, "grad_norm": 0.1618303039248095, "learning_rate": 1.8840179496030377e-05, "loss": 0.4879, "step": 490 }, { "epoch": 0.4575955265610438, "grad_norm": 0.18304676006766568, "learning_rate": 1.8833275802554366e-05, "loss": 0.4985, "step": 491 }, { "epoch": 0.45852749301025164, "grad_norm": 0.2596353204759088, "learning_rate": 1.882637210907836e-05, "loss": 0.5519, "step": 492 }, { "epoch": 0.4594594594594595, "grad_norm": 0.189683609537879, "learning_rate": 1.8819468415602347e-05, "loss": 0.4964, "step": 493 }, { "epoch": 0.4603914259086673, "grad_norm": 0.21348410988039773, "learning_rate": 1.881256472212634e-05, "loss": 0.5215, "step": 494 }, { "epoch": 0.4613233923578751, "grad_norm": 0.19305855072492478, "learning_rate": 1.880566102865033e-05, "loss": 0.5585, "step": 495 }, { "epoch": 0.46225535880708296, "grad_norm": 0.18331737985056523, "learning_rate": 1.879875733517432e-05, "loss": 0.5293, "step": 496 }, { "epoch": 0.46318732525629075, "grad_norm": 0.2014364496635696, "learning_rate": 1.879185364169831e-05, "loss": 0.5403, "step": 497 }, { "epoch": 0.4641192917054986, "grad_norm": 0.17460947119156472, "learning_rate": 1.87849499482223e-05, "loss": 0.5127, "step": 498 }, { "epoch": 0.46505125815470644, "grad_norm": 0.1830437001532292, "learning_rate": 1.8778046254746292e-05, "loss": 0.5278, "step": 499 }, { "epoch": 0.4659832246039143, "grad_norm": 0.18600349230388025, "learning_rate": 1.877114256127028e-05, "loss": 0.5325, "step": 500 }, { "epoch": 0.46691519105312207, "grad_norm": 0.18167513961510479, "learning_rate": 1.876423886779427e-05, "loss": 0.5351, "step": 501 }, { "epoch": 0.4678471575023299, "grad_norm": 0.18438845078745494, "learning_rate": 1.8757335174318262e-05, "loss": 0.5204, "step": 502 }, { "epoch": 0.46877912395153776, "grad_norm": 0.16839655969660633, "learning_rate": 1.875043148084225e-05, "loss": 0.4964, "step": 503 }, { "epoch": 0.46971109040074555, "grad_norm": 0.1990069419314965, "learning_rate": 1.8743527787366244e-05, "loss": 0.5061, "step": 504 }, { "epoch": 0.4706430568499534, "grad_norm": 0.17581290602544275, "learning_rate": 1.8736624093890233e-05, "loss": 0.5085, "step": 505 }, { "epoch": 0.47157502329916123, "grad_norm": 0.1992090922092487, "learning_rate": 1.8729720400414225e-05, "loss": 0.5253, "step": 506 }, { "epoch": 0.4725069897483691, "grad_norm": 0.1933568504253529, "learning_rate": 1.8722816706938214e-05, "loss": 0.5305, "step": 507 }, { "epoch": 0.47343895619757687, "grad_norm": 0.18324220411619416, "learning_rate": 1.8715913013462203e-05, "loss": 0.5273, "step": 508 }, { "epoch": 0.4743709226467847, "grad_norm": 0.20390052463140876, "learning_rate": 1.8709009319986192e-05, "loss": 0.5465, "step": 509 }, { "epoch": 0.47530288909599255, "grad_norm": 0.21055047065691182, "learning_rate": 1.8702105626510185e-05, "loss": 0.4886, "step": 510 }, { "epoch": 0.4762348555452004, "grad_norm": 0.19861082176346642, "learning_rate": 1.8695201933034174e-05, "loss": 0.5148, "step": 511 }, { "epoch": 0.4771668219944082, "grad_norm": 0.19606910289220897, "learning_rate": 1.8688298239558166e-05, "loss": 0.5108, "step": 512 }, { "epoch": 0.47809878844361603, "grad_norm": 0.19784151689130433, "learning_rate": 1.8681394546082155e-05, "loss": 0.5488, "step": 513 }, { "epoch": 0.4790307548928239, "grad_norm": 0.16732137883172718, "learning_rate": 1.8674490852606148e-05, "loss": 0.514, "step": 514 }, { "epoch": 0.47996272134203166, "grad_norm": 0.19144808412888514, "learning_rate": 1.8667587159130137e-05, "loss": 0.5157, "step": 515 }, { "epoch": 0.4808946877912395, "grad_norm": 0.16576428745139118, "learning_rate": 1.8660683465654126e-05, "loss": 0.5193, "step": 516 }, { "epoch": 0.48182665424044735, "grad_norm": 0.21949845350160604, "learning_rate": 1.8653779772178118e-05, "loss": 0.5665, "step": 517 }, { "epoch": 0.4827586206896552, "grad_norm": 0.17748428608406333, "learning_rate": 1.8646876078702107e-05, "loss": 0.5048, "step": 518 }, { "epoch": 0.483690587138863, "grad_norm": 0.177253926344825, "learning_rate": 1.8639972385226096e-05, "loss": 0.517, "step": 519 }, { "epoch": 0.4846225535880708, "grad_norm": 0.16850006029284803, "learning_rate": 1.8633068691750085e-05, "loss": 0.5233, "step": 520 }, { "epoch": 0.48555452003727867, "grad_norm": 0.1796461063991708, "learning_rate": 1.8626164998274078e-05, "loss": 0.4983, "step": 521 }, { "epoch": 0.4864864864864865, "grad_norm": 0.18897153333359798, "learning_rate": 1.861926130479807e-05, "loss": 0.5268, "step": 522 }, { "epoch": 0.4874184529356943, "grad_norm": 0.17448071612942045, "learning_rate": 1.861235761132206e-05, "loss": 0.5193, "step": 523 }, { "epoch": 0.48835041938490215, "grad_norm": 0.3166851462963707, "learning_rate": 1.860545391784605e-05, "loss": 0.525, "step": 524 }, { "epoch": 0.48928238583411, "grad_norm": 0.19634160497169895, "learning_rate": 1.859855022437004e-05, "loss": 0.557, "step": 525 }, { "epoch": 0.4902143522833178, "grad_norm": 0.18499245309207957, "learning_rate": 1.859164653089403e-05, "loss": 0.5401, "step": 526 }, { "epoch": 0.4911463187325256, "grad_norm": 0.21760850706511425, "learning_rate": 1.858474283741802e-05, "loss": 0.5127, "step": 527 }, { "epoch": 0.49207828518173347, "grad_norm": 0.16234014318947138, "learning_rate": 1.857783914394201e-05, "loss": 0.5289, "step": 528 }, { "epoch": 0.4930102516309413, "grad_norm": 0.17954425981177907, "learning_rate": 1.8570935450466e-05, "loss": 0.4913, "step": 529 }, { "epoch": 0.4939422180801491, "grad_norm": 0.19539257071909152, "learning_rate": 1.856403175698999e-05, "loss": 0.5289, "step": 530 }, { "epoch": 0.49487418452935694, "grad_norm": 0.2074863367547076, "learning_rate": 1.8557128063513982e-05, "loss": 0.5278, "step": 531 }, { "epoch": 0.4958061509785648, "grad_norm": 0.19825252236501673, "learning_rate": 1.8550224370037974e-05, "loss": 0.5045, "step": 532 }, { "epoch": 0.4967381174277726, "grad_norm": 0.1766318326785503, "learning_rate": 1.8543320676561963e-05, "loss": 0.5392, "step": 533 }, { "epoch": 0.4976700838769804, "grad_norm": 0.22889685253371375, "learning_rate": 1.8536416983085952e-05, "loss": 0.5396, "step": 534 }, { "epoch": 0.49860205032618826, "grad_norm": 0.1664678695286912, "learning_rate": 1.8529513289609945e-05, "loss": 0.4974, "step": 535 }, { "epoch": 0.4995340167753961, "grad_norm": 0.17248812347890632, "learning_rate": 1.8522609596133934e-05, "loss": 0.514, "step": 536 }, { "epoch": 0.5004659832246039, "grad_norm": 0.17272976578120117, "learning_rate": 1.8515705902657923e-05, "loss": 0.4816, "step": 537 }, { "epoch": 0.5013979496738118, "grad_norm": 0.18833036172915124, "learning_rate": 1.8508802209181912e-05, "loss": 0.5247, "step": 538 }, { "epoch": 0.5023299161230196, "grad_norm": 0.20100910277718345, "learning_rate": 1.8501898515705904e-05, "loss": 0.5284, "step": 539 }, { "epoch": 0.5032618825722274, "grad_norm": 0.1913720715611104, "learning_rate": 1.8494994822229893e-05, "loss": 0.4998, "step": 540 }, { "epoch": 0.5041938490214353, "grad_norm": 0.18568799211765682, "learning_rate": 1.8488091128753886e-05, "loss": 0.5108, "step": 541 }, { "epoch": 0.5051258154706431, "grad_norm": 0.17158579466455848, "learning_rate": 1.8481187435277875e-05, "loss": 0.5198, "step": 542 }, { "epoch": 0.5060577819198508, "grad_norm": 0.18822189333573697, "learning_rate": 1.8474283741801867e-05, "loss": 0.5029, "step": 543 }, { "epoch": 0.5069897483690587, "grad_norm": 0.206402785694961, "learning_rate": 1.8467380048325856e-05, "loss": 0.5335, "step": 544 }, { "epoch": 0.5079217148182665, "grad_norm": 0.36484406974193023, "learning_rate": 1.8460476354849845e-05, "loss": 0.518, "step": 545 }, { "epoch": 0.5088536812674743, "grad_norm": 0.19991244565413266, "learning_rate": 1.8453572661373838e-05, "loss": 0.5073, "step": 546 }, { "epoch": 0.5097856477166822, "grad_norm": 0.18343373597229284, "learning_rate": 1.8446668967897827e-05, "loss": 0.5225, "step": 547 }, { "epoch": 0.51071761416589, "grad_norm": 0.16096448519644901, "learning_rate": 1.8439765274421816e-05, "loss": 0.4911, "step": 548 }, { "epoch": 0.5116495806150979, "grad_norm": 0.1920516738365542, "learning_rate": 1.8432861580945808e-05, "loss": 0.5291, "step": 549 }, { "epoch": 0.5125815470643057, "grad_norm": 0.16254219907752918, "learning_rate": 1.8425957887469797e-05, "loss": 0.5263, "step": 550 }, { "epoch": 0.5135135135135135, "grad_norm": 0.20327045577951094, "learning_rate": 1.841905419399379e-05, "loss": 0.5401, "step": 551 }, { "epoch": 0.5144454799627214, "grad_norm": 0.19575891465939374, "learning_rate": 1.841215050051778e-05, "loss": 0.4982, "step": 552 }, { "epoch": 0.5153774464119292, "grad_norm": 0.15955967414158312, "learning_rate": 1.8405246807041768e-05, "loss": 0.4639, "step": 553 }, { "epoch": 0.516309412861137, "grad_norm": 0.17817545938923512, "learning_rate": 1.839834311356576e-05, "loss": 0.5146, "step": 554 }, { "epoch": 0.5172413793103449, "grad_norm": 0.23852345951795625, "learning_rate": 1.839143942008975e-05, "loss": 0.5398, "step": 555 }, { "epoch": 0.5181733457595527, "grad_norm": 0.19563436193913272, "learning_rate": 1.8384535726613738e-05, "loss": 0.5797, "step": 556 }, { "epoch": 0.5191053122087604, "grad_norm": 0.16285269595121646, "learning_rate": 1.837763203313773e-05, "loss": 0.5031, "step": 557 }, { "epoch": 0.5200372786579683, "grad_norm": 0.1721463152928129, "learning_rate": 1.837072833966172e-05, "loss": 0.5638, "step": 558 }, { "epoch": 0.5209692451071761, "grad_norm": 0.17147145749792783, "learning_rate": 1.8363824646185712e-05, "loss": 0.5197, "step": 559 }, { "epoch": 0.5219012115563839, "grad_norm": 0.18873359746643192, "learning_rate": 1.83569209527097e-05, "loss": 0.5063, "step": 560 }, { "epoch": 0.5228331780055918, "grad_norm": 0.17620583646389543, "learning_rate": 1.8350017259233694e-05, "loss": 0.5251, "step": 561 }, { "epoch": 0.5237651444547996, "grad_norm": 0.21169023731105177, "learning_rate": 1.8343113565757683e-05, "loss": 0.5217, "step": 562 }, { "epoch": 0.5246971109040075, "grad_norm": 0.17129579768581032, "learning_rate": 1.833620987228167e-05, "loss": 0.4974, "step": 563 }, { "epoch": 0.5256290773532153, "grad_norm": 0.16735399920477945, "learning_rate": 1.832930617880566e-05, "loss": 0.5085, "step": 564 }, { "epoch": 0.5265610438024231, "grad_norm": 0.17622593598221648, "learning_rate": 1.8322402485329653e-05, "loss": 0.4852, "step": 565 }, { "epoch": 0.527493010251631, "grad_norm": 0.18259663017318603, "learning_rate": 1.8315498791853642e-05, "loss": 0.5185, "step": 566 }, { "epoch": 0.5284249767008388, "grad_norm": 0.17475159403770457, "learning_rate": 1.830859509837763e-05, "loss": 0.5395, "step": 567 }, { "epoch": 0.5293569431500466, "grad_norm": 0.15914733385342594, "learning_rate": 1.8301691404901624e-05, "loss": 0.4771, "step": 568 }, { "epoch": 0.5302889095992545, "grad_norm": 0.16550871717071958, "learning_rate": 1.8294787711425616e-05, "loss": 0.5011, "step": 569 }, { "epoch": 0.5312208760484622, "grad_norm": 0.21419688537515805, "learning_rate": 1.8287884017949605e-05, "loss": 0.5022, "step": 570 }, { "epoch": 0.53215284249767, "grad_norm": 0.1632966486452505, "learning_rate": 1.8280980324473594e-05, "loss": 0.4858, "step": 571 }, { "epoch": 0.5330848089468779, "grad_norm": 0.18426053536703568, "learning_rate": 1.8274076630997587e-05, "loss": 0.5276, "step": 572 }, { "epoch": 0.5340167753960857, "grad_norm": 0.18181660267576413, "learning_rate": 1.8267172937521576e-05, "loss": 0.5321, "step": 573 }, { "epoch": 0.5349487418452936, "grad_norm": 0.17914675938288488, "learning_rate": 1.8260269244045565e-05, "loss": 0.5184, "step": 574 }, { "epoch": 0.5358807082945014, "grad_norm": 0.16037396954560937, "learning_rate": 1.8253365550569557e-05, "loss": 0.5085, "step": 575 }, { "epoch": 0.5368126747437092, "grad_norm": 0.16878815803528888, "learning_rate": 1.8246461857093546e-05, "loss": 0.4963, "step": 576 }, { "epoch": 0.5377446411929171, "grad_norm": 0.16569568142984206, "learning_rate": 1.8239558163617535e-05, "loss": 0.508, "step": 577 }, { "epoch": 0.5386766076421249, "grad_norm": 0.1632554387448227, "learning_rate": 1.8232654470141528e-05, "loss": 0.507, "step": 578 }, { "epoch": 0.5396085740913327, "grad_norm": 0.1709890933151334, "learning_rate": 1.822575077666552e-05, "loss": 0.5238, "step": 579 }, { "epoch": 0.5405405405405406, "grad_norm": 0.18966044110518246, "learning_rate": 1.821884708318951e-05, "loss": 0.5366, "step": 580 }, { "epoch": 0.5414725069897484, "grad_norm": 0.17937600430401834, "learning_rate": 1.8211943389713498e-05, "loss": 0.5236, "step": 581 }, { "epoch": 0.5424044734389561, "grad_norm": 0.18416546627083916, "learning_rate": 1.8205039696237487e-05, "loss": 0.526, "step": 582 }, { "epoch": 0.543336439888164, "grad_norm": 0.1609633439286662, "learning_rate": 1.819813600276148e-05, "loss": 0.4779, "step": 583 }, { "epoch": 0.5442684063373718, "grad_norm": 0.18980194505942544, "learning_rate": 1.819123230928547e-05, "loss": 0.5234, "step": 584 }, { "epoch": 0.5452003727865797, "grad_norm": 0.17615157713250712, "learning_rate": 1.8184328615809458e-05, "loss": 0.503, "step": 585 }, { "epoch": 0.5461323392357875, "grad_norm": 0.18301491909046647, "learning_rate": 1.817742492233345e-05, "loss": 0.5009, "step": 586 }, { "epoch": 0.5470643056849953, "grad_norm": 0.16697231664173517, "learning_rate": 1.817052122885744e-05, "loss": 0.4659, "step": 587 }, { "epoch": 0.5479962721342032, "grad_norm": 0.15886221957234162, "learning_rate": 1.816361753538143e-05, "loss": 0.4972, "step": 588 }, { "epoch": 0.548928238583411, "grad_norm": 0.19979541644291782, "learning_rate": 1.815671384190542e-05, "loss": 0.5687, "step": 589 }, { "epoch": 0.5498602050326188, "grad_norm": 0.18847669084102434, "learning_rate": 1.8149810148429413e-05, "loss": 0.5094, "step": 590 }, { "epoch": 0.5507921714818267, "grad_norm": 0.17115954356176036, "learning_rate": 1.8142906454953402e-05, "loss": 0.514, "step": 591 }, { "epoch": 0.5517241379310345, "grad_norm": 0.30078825365359035, "learning_rate": 1.813600276147739e-05, "loss": 0.4997, "step": 592 }, { "epoch": 0.5526561043802423, "grad_norm": 0.18370059418663875, "learning_rate": 1.812909906800138e-05, "loss": 0.4924, "step": 593 }, { "epoch": 0.5535880708294502, "grad_norm": 0.1798254199921798, "learning_rate": 1.8122195374525372e-05, "loss": 0.531, "step": 594 }, { "epoch": 0.554520037278658, "grad_norm": 0.21394673861338037, "learning_rate": 1.811529168104936e-05, "loss": 0.5174, "step": 595 }, { "epoch": 0.5554520037278659, "grad_norm": 0.18758066999212283, "learning_rate": 1.8108387987573354e-05, "loss": 0.5023, "step": 596 }, { "epoch": 0.5563839701770736, "grad_norm": 0.15906307882028692, "learning_rate": 1.8101484294097343e-05, "loss": 0.4957, "step": 597 }, { "epoch": 0.5573159366262814, "grad_norm": 0.23113986990166158, "learning_rate": 1.8094580600621335e-05, "loss": 0.5135, "step": 598 }, { "epoch": 0.5582479030754893, "grad_norm": 0.17108255731172997, "learning_rate": 1.8087676907145324e-05, "loss": 0.5225, "step": 599 }, { "epoch": 0.5591798695246971, "grad_norm": 0.1570970988394945, "learning_rate": 1.8080773213669313e-05, "loss": 0.5074, "step": 600 }, { "epoch": 0.5601118359739049, "grad_norm": 0.18999330971388906, "learning_rate": 1.8073869520193306e-05, "loss": 0.5063, "step": 601 }, { "epoch": 0.5610438024231128, "grad_norm": 0.1735271418292203, "learning_rate": 1.8066965826717295e-05, "loss": 0.5176, "step": 602 }, { "epoch": 0.5619757688723206, "grad_norm": 0.16375004463199755, "learning_rate": 1.8060062133241284e-05, "loss": 0.5191, "step": 603 }, { "epoch": 0.5629077353215284, "grad_norm": 0.17526348803927638, "learning_rate": 1.8053158439765276e-05, "loss": 0.5114, "step": 604 }, { "epoch": 0.5638397017707363, "grad_norm": 0.16031440690860954, "learning_rate": 1.8046254746289265e-05, "loss": 0.483, "step": 605 }, { "epoch": 0.5647716682199441, "grad_norm": 0.22762433216928124, "learning_rate": 1.8039351052813258e-05, "loss": 0.5486, "step": 606 }, { "epoch": 0.5657036346691519, "grad_norm": 0.1647780950332057, "learning_rate": 1.8032447359337247e-05, "loss": 0.5077, "step": 607 }, { "epoch": 0.5666356011183598, "grad_norm": 0.1805644007690387, "learning_rate": 1.802554366586124e-05, "loss": 0.5379, "step": 608 }, { "epoch": 0.5675675675675675, "grad_norm": 0.18443886727498712, "learning_rate": 1.801863997238523e-05, "loss": 0.514, "step": 609 }, { "epoch": 0.5684995340167754, "grad_norm": 0.20893738645562485, "learning_rate": 1.8011736278909217e-05, "loss": 0.4979, "step": 610 }, { "epoch": 0.5694315004659832, "grad_norm": 0.15809534977014494, "learning_rate": 1.8004832585433206e-05, "loss": 0.4934, "step": 611 }, { "epoch": 0.570363466915191, "grad_norm": 0.16310627407075087, "learning_rate": 1.79979288919572e-05, "loss": 0.4848, "step": 612 }, { "epoch": 0.5712954333643989, "grad_norm": 0.1884332644556878, "learning_rate": 1.7991025198481188e-05, "loss": 0.5354, "step": 613 }, { "epoch": 0.5722273998136067, "grad_norm": 0.19266492947587632, "learning_rate": 1.798412150500518e-05, "loss": 0.5184, "step": 614 }, { "epoch": 0.5731593662628145, "grad_norm": 0.16870213571260384, "learning_rate": 1.797721781152917e-05, "loss": 0.498, "step": 615 }, { "epoch": 0.5740913327120224, "grad_norm": 0.21155165462198205, "learning_rate": 1.7970314118053162e-05, "loss": 0.5205, "step": 616 }, { "epoch": 0.5750232991612302, "grad_norm": 0.1712507018387935, "learning_rate": 1.796341042457715e-05, "loss": 0.4884, "step": 617 }, { "epoch": 0.575955265610438, "grad_norm": 0.1616778817013356, "learning_rate": 1.795650673110114e-05, "loss": 0.5115, "step": 618 }, { "epoch": 0.5768872320596459, "grad_norm": 0.154468928331482, "learning_rate": 1.7949603037625132e-05, "loss": 0.4965, "step": 619 }, { "epoch": 0.5778191985088537, "grad_norm": 0.19803526099108634, "learning_rate": 1.794269934414912e-05, "loss": 0.4907, "step": 620 }, { "epoch": 0.5787511649580616, "grad_norm": 0.15590327273868554, "learning_rate": 1.793579565067311e-05, "loss": 0.5008, "step": 621 }, { "epoch": 0.5796831314072693, "grad_norm": 0.29287690221141877, "learning_rate": 1.79288919571971e-05, "loss": 0.5458, "step": 622 }, { "epoch": 0.5806150978564771, "grad_norm": 0.17725293149683152, "learning_rate": 1.7921988263721092e-05, "loss": 0.5174, "step": 623 }, { "epoch": 0.581547064305685, "grad_norm": 0.19706750269060672, "learning_rate": 1.7915084570245084e-05, "loss": 0.531, "step": 624 }, { "epoch": 0.5824790307548928, "grad_norm": 0.21317331574594703, "learning_rate": 1.7908180876769073e-05, "loss": 0.5463, "step": 625 }, { "epoch": 0.5834109972041006, "grad_norm": 0.2567598088084227, "learning_rate": 1.7901277183293066e-05, "loss": 0.5309, "step": 626 }, { "epoch": 0.5843429636533085, "grad_norm": 0.18574633886976027, "learning_rate": 1.7894373489817055e-05, "loss": 0.5494, "step": 627 }, { "epoch": 0.5852749301025163, "grad_norm": 0.17389611055515694, "learning_rate": 1.7887469796341044e-05, "loss": 0.5211, "step": 628 }, { "epoch": 0.5862068965517241, "grad_norm": 0.18657995828116308, "learning_rate": 1.7880566102865033e-05, "loss": 0.4982, "step": 629 }, { "epoch": 0.587138863000932, "grad_norm": 0.17470818991978482, "learning_rate": 1.7873662409389025e-05, "loss": 0.4973, "step": 630 }, { "epoch": 0.5880708294501398, "grad_norm": 0.16715716575234796, "learning_rate": 1.7866758715913014e-05, "loss": 0.502, "step": 631 }, { "epoch": 0.5890027958993477, "grad_norm": 0.23859607261014212, "learning_rate": 1.7859855022437003e-05, "loss": 0.5111, "step": 632 }, { "epoch": 0.5899347623485555, "grad_norm": 0.1942824396245023, "learning_rate": 1.7852951328960996e-05, "loss": 0.5141, "step": 633 }, { "epoch": 0.5908667287977633, "grad_norm": 0.1740303664297931, "learning_rate": 1.7846047635484988e-05, "loss": 0.5438, "step": 634 }, { "epoch": 0.5917986952469712, "grad_norm": 0.1832414222997951, "learning_rate": 1.7839143942008977e-05, "loss": 0.5364, "step": 635 }, { "epoch": 0.5927306616961789, "grad_norm": 0.170902061003363, "learning_rate": 1.7832240248532966e-05, "loss": 0.5125, "step": 636 }, { "epoch": 0.5936626281453867, "grad_norm": 0.19250672236243016, "learning_rate": 1.782533655505696e-05, "loss": 0.5378, "step": 637 }, { "epoch": 0.5945945945945946, "grad_norm": 0.18067824188183174, "learning_rate": 1.7818432861580948e-05, "loss": 0.5063, "step": 638 }, { "epoch": 0.5955265610438024, "grad_norm": 0.154958128102209, "learning_rate": 1.7811529168104937e-05, "loss": 0.5148, "step": 639 }, { "epoch": 0.5964585274930102, "grad_norm": 0.16522326722314967, "learning_rate": 1.7804625474628926e-05, "loss": 0.49, "step": 640 }, { "epoch": 0.5973904939422181, "grad_norm": 0.17838155578167564, "learning_rate": 1.7797721781152918e-05, "loss": 0.5053, "step": 641 }, { "epoch": 0.5983224603914259, "grad_norm": 0.16737659153983134, "learning_rate": 1.7790818087676907e-05, "loss": 0.5361, "step": 642 }, { "epoch": 0.5992544268406338, "grad_norm": 0.16027909440058996, "learning_rate": 1.77839143942009e-05, "loss": 0.5155, "step": 643 }, { "epoch": 0.6001863932898416, "grad_norm": 0.16172586121879665, "learning_rate": 1.777701070072489e-05, "loss": 0.5027, "step": 644 }, { "epoch": 0.6011183597390494, "grad_norm": 0.16426741815127152, "learning_rate": 1.777010700724888e-05, "loss": 0.4989, "step": 645 }, { "epoch": 0.6020503261882573, "grad_norm": 0.17568362475914448, "learning_rate": 1.776320331377287e-05, "loss": 0.5226, "step": 646 }, { "epoch": 0.6029822926374651, "grad_norm": 0.1769640885442992, "learning_rate": 1.775629962029686e-05, "loss": 0.5357, "step": 647 }, { "epoch": 0.6039142590866728, "grad_norm": 0.17563783555934953, "learning_rate": 1.774939592682085e-05, "loss": 0.5338, "step": 648 }, { "epoch": 0.6048462255358807, "grad_norm": 0.17865292266424776, "learning_rate": 1.774249223334484e-05, "loss": 0.502, "step": 649 }, { "epoch": 0.6057781919850885, "grad_norm": 0.15846277969709888, "learning_rate": 1.773558853986883e-05, "loss": 0.5112, "step": 650 }, { "epoch": 0.6067101584342963, "grad_norm": 0.18388762546780646, "learning_rate": 1.7728684846392822e-05, "loss": 0.5322, "step": 651 }, { "epoch": 0.6076421248835042, "grad_norm": 0.18782919240043158, "learning_rate": 1.772178115291681e-05, "loss": 0.5218, "step": 652 }, { "epoch": 0.608574091332712, "grad_norm": 0.19253464557567312, "learning_rate": 1.7714877459440804e-05, "loss": 0.4986, "step": 653 }, { "epoch": 0.6095060577819198, "grad_norm": 0.17709939661782054, "learning_rate": 1.7707973765964793e-05, "loss": 0.4968, "step": 654 }, { "epoch": 0.6104380242311277, "grad_norm": 0.17651795113562158, "learning_rate": 1.7701070072488782e-05, "loss": 0.5384, "step": 655 }, { "epoch": 0.6113699906803355, "grad_norm": 0.17859411538064646, "learning_rate": 1.7694166379012774e-05, "loss": 0.5106, "step": 656 }, { "epoch": 0.6123019571295434, "grad_norm": 0.17621769244453742, "learning_rate": 1.7687262685536763e-05, "loss": 0.5137, "step": 657 }, { "epoch": 0.6132339235787512, "grad_norm": 0.1630065830737473, "learning_rate": 1.7680358992060752e-05, "loss": 0.4739, "step": 658 }, { "epoch": 0.614165890027959, "grad_norm": 0.1951288717846004, "learning_rate": 1.7673455298584745e-05, "loss": 0.47, "step": 659 }, { "epoch": 0.6150978564771669, "grad_norm": 0.17169362260089477, "learning_rate": 1.7666551605108734e-05, "loss": 0.5096, "step": 660 }, { "epoch": 0.6160298229263746, "grad_norm": 0.19123117875714055, "learning_rate": 1.7659647911632726e-05, "loss": 0.4792, "step": 661 }, { "epoch": 0.6169617893755824, "grad_norm": 0.16585639858768703, "learning_rate": 1.7652744218156715e-05, "loss": 0.4826, "step": 662 }, { "epoch": 0.6178937558247903, "grad_norm": 0.23901456068315716, "learning_rate": 1.7645840524680708e-05, "loss": 0.5227, "step": 663 }, { "epoch": 0.6188257222739981, "grad_norm": 0.18588986830919504, "learning_rate": 1.7638936831204697e-05, "loss": 0.5089, "step": 664 }, { "epoch": 0.6197576887232059, "grad_norm": 0.1957706750809611, "learning_rate": 1.7632033137728686e-05, "loss": 0.5014, "step": 665 }, { "epoch": 0.6206896551724138, "grad_norm": 0.19538578750004876, "learning_rate": 1.7625129444252678e-05, "loss": 0.5116, "step": 666 }, { "epoch": 0.6216216216216216, "grad_norm": 0.1937050527947627, "learning_rate": 1.7618225750776667e-05, "loss": 0.496, "step": 667 }, { "epoch": 0.6225535880708295, "grad_norm": 0.17589758175752096, "learning_rate": 1.7611322057300656e-05, "loss": 0.4748, "step": 668 }, { "epoch": 0.6234855545200373, "grad_norm": 0.17385792946320744, "learning_rate": 1.7604418363824645e-05, "loss": 0.5137, "step": 669 }, { "epoch": 0.6244175209692451, "grad_norm": 0.18529927411735303, "learning_rate": 1.7597514670348638e-05, "loss": 0.4839, "step": 670 }, { "epoch": 0.625349487418453, "grad_norm": 0.22138644221738044, "learning_rate": 1.759061097687263e-05, "loss": 0.5239, "step": 671 }, { "epoch": 0.6262814538676608, "grad_norm": 0.16365010358696053, "learning_rate": 1.758370728339662e-05, "loss": 0.5228, "step": 672 }, { "epoch": 0.6272134203168686, "grad_norm": 0.18169684600506325, "learning_rate": 1.7576803589920608e-05, "loss": 0.5046, "step": 673 }, { "epoch": 0.6281453867660765, "grad_norm": 0.1721591187165702, "learning_rate": 1.75698998964446e-05, "loss": 0.4969, "step": 674 }, { "epoch": 0.6290773532152842, "grad_norm": 0.16210356170633622, "learning_rate": 1.756299620296859e-05, "loss": 0.4834, "step": 675 }, { "epoch": 0.630009319664492, "grad_norm": 0.17567336579635628, "learning_rate": 1.755609250949258e-05, "loss": 0.5165, "step": 676 }, { "epoch": 0.6309412861136999, "grad_norm": 0.17691897229712267, "learning_rate": 1.754918881601657e-05, "loss": 0.4886, "step": 677 }, { "epoch": 0.6318732525629077, "grad_norm": 0.16602841781920183, "learning_rate": 1.754228512254056e-05, "loss": 0.4805, "step": 678 }, { "epoch": 0.6328052190121156, "grad_norm": 0.18183338819623693, "learning_rate": 1.753538142906455e-05, "loss": 0.5163, "step": 679 }, { "epoch": 0.6337371854613234, "grad_norm": 0.17300339627330835, "learning_rate": 1.752847773558854e-05, "loss": 0.5413, "step": 680 }, { "epoch": 0.6346691519105312, "grad_norm": 0.18172530876742163, "learning_rate": 1.7521574042112534e-05, "loss": 0.539, "step": 681 }, { "epoch": 0.6356011183597391, "grad_norm": 0.20128361717227544, "learning_rate": 1.7514670348636523e-05, "loss": 0.5115, "step": 682 }, { "epoch": 0.6365330848089469, "grad_norm": 0.1944533130355394, "learning_rate": 1.7507766655160512e-05, "loss": 0.5225, "step": 683 }, { "epoch": 0.6374650512581547, "grad_norm": 0.1811440861862893, "learning_rate": 1.75008629616845e-05, "loss": 0.5295, "step": 684 }, { "epoch": 0.6383970177073626, "grad_norm": 0.16878660756024405, "learning_rate": 1.7493959268208494e-05, "loss": 0.5016, "step": 685 }, { "epoch": 0.6393289841565704, "grad_norm": 0.26018141453094146, "learning_rate": 1.7487055574732483e-05, "loss": 0.5061, "step": 686 }, { "epoch": 0.6402609506057781, "grad_norm": 0.17550505715634465, "learning_rate": 1.748015188125647e-05, "loss": 0.5203, "step": 687 }, { "epoch": 0.641192917054986, "grad_norm": 0.1722597959312706, "learning_rate": 1.7473248187780464e-05, "loss": 0.4939, "step": 688 }, { "epoch": 0.6421248835041938, "grad_norm": 0.16817867179169269, "learning_rate": 1.7466344494304453e-05, "loss": 0.506, "step": 689 }, { "epoch": 0.6430568499534017, "grad_norm": 0.17548582838561874, "learning_rate": 1.7459440800828446e-05, "loss": 0.4773, "step": 690 }, { "epoch": 0.6439888164026095, "grad_norm": 0.1527096763562513, "learning_rate": 1.7452537107352435e-05, "loss": 0.5132, "step": 691 }, { "epoch": 0.6449207828518173, "grad_norm": 0.1677429486892361, "learning_rate": 1.7445633413876427e-05, "loss": 0.493, "step": 692 }, { "epoch": 0.6458527493010252, "grad_norm": 0.1900760384064093, "learning_rate": 1.7438729720400416e-05, "loss": 0.502, "step": 693 }, { "epoch": 0.646784715750233, "grad_norm": 0.1631919623524759, "learning_rate": 1.7431826026924405e-05, "loss": 0.5206, "step": 694 }, { "epoch": 0.6477166821994408, "grad_norm": 0.15540003188951934, "learning_rate": 1.7424922333448394e-05, "loss": 0.4953, "step": 695 }, { "epoch": 0.6486486486486487, "grad_norm": 0.1789800694156746, "learning_rate": 1.7418018639972387e-05, "loss": 0.5413, "step": 696 }, { "epoch": 0.6495806150978565, "grad_norm": 0.19106143468685116, "learning_rate": 1.7411114946496376e-05, "loss": 0.5169, "step": 697 }, { "epoch": 0.6505125815470643, "grad_norm": 0.1714951729918087, "learning_rate": 1.7404211253020368e-05, "loss": 0.5118, "step": 698 }, { "epoch": 0.6514445479962722, "grad_norm": 0.17920865399006883, "learning_rate": 1.7397307559544357e-05, "loss": 0.5145, "step": 699 }, { "epoch": 0.65237651444548, "grad_norm": 0.17229976492961527, "learning_rate": 1.739040386606835e-05, "loss": 0.4883, "step": 700 }, { "epoch": 0.6533084808946877, "grad_norm": 0.14807949249253122, "learning_rate": 1.738350017259234e-05, "loss": 0.472, "step": 701 }, { "epoch": 0.6542404473438956, "grad_norm": 0.1801793925277413, "learning_rate": 1.7376596479116328e-05, "loss": 0.5503, "step": 702 }, { "epoch": 0.6551724137931034, "grad_norm": 0.20331587669770407, "learning_rate": 1.736969278564032e-05, "loss": 0.5383, "step": 703 }, { "epoch": 0.6561043802423113, "grad_norm": 0.17651064395437607, "learning_rate": 1.736278909216431e-05, "loss": 0.5075, "step": 704 }, { "epoch": 0.6570363466915191, "grad_norm": 0.1779570773194271, "learning_rate": 1.7355885398688298e-05, "loss": 0.4758, "step": 705 }, { "epoch": 0.6579683131407269, "grad_norm": 0.1567094500269976, "learning_rate": 1.734898170521229e-05, "loss": 0.4794, "step": 706 }, { "epoch": 0.6589002795899348, "grad_norm": 0.17110982969549005, "learning_rate": 1.734207801173628e-05, "loss": 0.5297, "step": 707 }, { "epoch": 0.6598322460391426, "grad_norm": 0.19651202395595754, "learning_rate": 1.7335174318260272e-05, "loss": 0.5034, "step": 708 }, { "epoch": 0.6607642124883504, "grad_norm": 0.18635293389715726, "learning_rate": 1.732827062478426e-05, "loss": 0.5132, "step": 709 }, { "epoch": 0.6616961789375583, "grad_norm": 0.18135795633192353, "learning_rate": 1.7321366931308253e-05, "loss": 0.519, "step": 710 }, { "epoch": 0.6626281453867661, "grad_norm": 0.16524531262932945, "learning_rate": 1.7314463237832242e-05, "loss": 0.5058, "step": 711 }, { "epoch": 0.6635601118359739, "grad_norm": 0.1765682840362793, "learning_rate": 1.730755954435623e-05, "loss": 0.4865, "step": 712 }, { "epoch": 0.6644920782851818, "grad_norm": 0.183186983010839, "learning_rate": 1.730065585088022e-05, "loss": 0.4983, "step": 713 }, { "epoch": 0.6654240447343895, "grad_norm": 0.16430824425696242, "learning_rate": 1.7293752157404213e-05, "loss": 0.4835, "step": 714 }, { "epoch": 0.6663560111835974, "grad_norm": 0.17007124213297437, "learning_rate": 1.7286848463928202e-05, "loss": 0.5021, "step": 715 }, { "epoch": 0.6672879776328052, "grad_norm": 0.17639604353383864, "learning_rate": 1.7279944770452194e-05, "loss": 0.501, "step": 716 }, { "epoch": 0.668219944082013, "grad_norm": 0.19953739826696856, "learning_rate": 1.7273041076976183e-05, "loss": 0.5016, "step": 717 }, { "epoch": 0.6691519105312209, "grad_norm": 0.1845785248915776, "learning_rate": 1.7266137383500176e-05, "loss": 0.5272, "step": 718 }, { "epoch": 0.6700838769804287, "grad_norm": 0.185931864519549, "learning_rate": 1.7259233690024165e-05, "loss": 0.4923, "step": 719 }, { "epoch": 0.6710158434296365, "grad_norm": 0.18367963293970782, "learning_rate": 1.7252329996548154e-05, "loss": 0.5213, "step": 720 }, { "epoch": 0.6719478098788444, "grad_norm": 0.2059514586384201, "learning_rate": 1.7245426303072146e-05, "loss": 0.5319, "step": 721 }, { "epoch": 0.6728797763280522, "grad_norm": 0.16175965605722845, "learning_rate": 1.7238522609596135e-05, "loss": 0.4745, "step": 722 }, { "epoch": 0.67381174277726, "grad_norm": 0.17476249526743076, "learning_rate": 1.7231618916120124e-05, "loss": 0.5208, "step": 723 }, { "epoch": 0.6747437092264679, "grad_norm": 0.1810279311963977, "learning_rate": 1.7224715222644113e-05, "loss": 0.5086, "step": 724 }, { "epoch": 0.6756756756756757, "grad_norm": 0.17309568817327858, "learning_rate": 1.7217811529168106e-05, "loss": 0.4969, "step": 725 }, { "epoch": 0.6766076421248836, "grad_norm": 0.18429794310980235, "learning_rate": 1.72109078356921e-05, "loss": 0.5006, "step": 726 }, { "epoch": 0.6775396085740913, "grad_norm": 0.17639654175166283, "learning_rate": 1.7204004142216087e-05, "loss": 0.5085, "step": 727 }, { "epoch": 0.6784715750232991, "grad_norm": 0.17558951013551358, "learning_rate": 1.719710044874008e-05, "loss": 0.4875, "step": 728 }, { "epoch": 0.679403541472507, "grad_norm": 0.156308034257456, "learning_rate": 1.719019675526407e-05, "loss": 0.4872, "step": 729 }, { "epoch": 0.6803355079217148, "grad_norm": 0.16552931408945154, "learning_rate": 1.7183293061788058e-05, "loss": 0.5109, "step": 730 }, { "epoch": 0.6812674743709226, "grad_norm": 0.16946227757376983, "learning_rate": 1.7176389368312047e-05, "loss": 0.4908, "step": 731 }, { "epoch": 0.6821994408201305, "grad_norm": 0.17145877009520327, "learning_rate": 1.716948567483604e-05, "loss": 0.5024, "step": 732 }, { "epoch": 0.6831314072693383, "grad_norm": 0.17225196759945008, "learning_rate": 1.716258198136003e-05, "loss": 0.5116, "step": 733 }, { "epoch": 0.6840633737185461, "grad_norm": 0.16811504143097675, "learning_rate": 1.7155678287884017e-05, "loss": 0.5069, "step": 734 }, { "epoch": 0.684995340167754, "grad_norm": 0.1635067400645435, "learning_rate": 1.714877459440801e-05, "loss": 0.5119, "step": 735 }, { "epoch": 0.6859273066169618, "grad_norm": 0.16961835902315078, "learning_rate": 1.7141870900932002e-05, "loss": 0.5091, "step": 736 }, { "epoch": 0.6868592730661697, "grad_norm": 0.19039842024318246, "learning_rate": 1.713496720745599e-05, "loss": 0.5106, "step": 737 }, { "epoch": 0.6877912395153775, "grad_norm": 0.1560512956634307, "learning_rate": 1.712806351397998e-05, "loss": 0.4785, "step": 738 }, { "epoch": 0.6887232059645852, "grad_norm": 0.1637913097062564, "learning_rate": 1.7121159820503973e-05, "loss": 0.515, "step": 739 }, { "epoch": 0.6896551724137931, "grad_norm": 0.17788579171599223, "learning_rate": 1.7114256127027962e-05, "loss": 0.4747, "step": 740 }, { "epoch": 0.6905871388630009, "grad_norm": 0.17141147925831854, "learning_rate": 1.710735243355195e-05, "loss": 0.4837, "step": 741 }, { "epoch": 0.6915191053122087, "grad_norm": 0.15580655129452659, "learning_rate": 1.710044874007594e-05, "loss": 0.4845, "step": 742 }, { "epoch": 0.6924510717614166, "grad_norm": 0.15585231044218426, "learning_rate": 1.7093545046599932e-05, "loss": 0.4734, "step": 743 }, { "epoch": 0.6933830382106244, "grad_norm": 0.1687837345232117, "learning_rate": 1.708664135312392e-05, "loss": 0.5072, "step": 744 }, { "epoch": 0.6943150046598322, "grad_norm": 0.1795843818886789, "learning_rate": 1.7079737659647914e-05, "loss": 0.5098, "step": 745 }, { "epoch": 0.6952469711090401, "grad_norm": 0.15721357973509473, "learning_rate": 1.7072833966171903e-05, "loss": 0.4718, "step": 746 }, { "epoch": 0.6961789375582479, "grad_norm": 0.1851235359591318, "learning_rate": 1.7065930272695895e-05, "loss": 0.5091, "step": 747 }, { "epoch": 0.6971109040074557, "grad_norm": 0.15668741631352134, "learning_rate": 1.7059026579219884e-05, "loss": 0.4942, "step": 748 }, { "epoch": 0.6980428704566636, "grad_norm": 0.16498439050040745, "learning_rate": 1.7052122885743873e-05, "loss": 0.5154, "step": 749 }, { "epoch": 0.6989748369058714, "grad_norm": 0.15080651973897571, "learning_rate": 1.7045219192267866e-05, "loss": 0.4928, "step": 750 }, { "epoch": 0.6999068033550793, "grad_norm": 0.16605399812423088, "learning_rate": 1.7038315498791855e-05, "loss": 0.5136, "step": 751 }, { "epoch": 0.700838769804287, "grad_norm": 0.15467746416927208, "learning_rate": 1.7031411805315844e-05, "loss": 0.4832, "step": 752 }, { "epoch": 0.7017707362534948, "grad_norm": 0.15854150622124003, "learning_rate": 1.7024508111839836e-05, "loss": 0.4955, "step": 753 }, { "epoch": 0.7027027027027027, "grad_norm": 0.16419873051034511, "learning_rate": 1.7017604418363825e-05, "loss": 0.4917, "step": 754 }, { "epoch": 0.7036346691519105, "grad_norm": 0.17030147692059175, "learning_rate": 1.7010700724887818e-05, "loss": 0.5125, "step": 755 }, { "epoch": 0.7045666356011183, "grad_norm": 0.16428070272396478, "learning_rate": 1.7003797031411807e-05, "loss": 0.4929, "step": 756 }, { "epoch": 0.7054986020503262, "grad_norm": 0.1572852858094957, "learning_rate": 1.69968933379358e-05, "loss": 0.4984, "step": 757 }, { "epoch": 0.706430568499534, "grad_norm": 0.17570207130919963, "learning_rate": 1.6989989644459788e-05, "loss": 0.5267, "step": 758 }, { "epoch": 0.7073625349487418, "grad_norm": 0.15744219436042214, "learning_rate": 1.6983085950983777e-05, "loss": 0.5107, "step": 759 }, { "epoch": 0.7082945013979497, "grad_norm": 0.16603333746172463, "learning_rate": 1.6976182257507766e-05, "loss": 0.4932, "step": 760 }, { "epoch": 0.7092264678471575, "grad_norm": 0.18152644611947383, "learning_rate": 1.696927856403176e-05, "loss": 0.4931, "step": 761 }, { "epoch": 0.7101584342963654, "grad_norm": 0.17861755890433256, "learning_rate": 1.6962374870555748e-05, "loss": 0.506, "step": 762 }, { "epoch": 0.7110904007455732, "grad_norm": 0.17836312380149413, "learning_rate": 1.695547117707974e-05, "loss": 0.4576, "step": 763 }, { "epoch": 0.712022367194781, "grad_norm": 0.1624434571863427, "learning_rate": 1.694856748360373e-05, "loss": 0.4817, "step": 764 }, { "epoch": 0.7129543336439889, "grad_norm": 0.16816176810872474, "learning_rate": 1.694166379012772e-05, "loss": 0.4847, "step": 765 }, { "epoch": 0.7138863000931966, "grad_norm": 0.17380825305282668, "learning_rate": 1.693476009665171e-05, "loss": 0.5127, "step": 766 }, { "epoch": 0.7148182665424044, "grad_norm": 0.16607985716440019, "learning_rate": 1.69278564031757e-05, "loss": 0.478, "step": 767 }, { "epoch": 0.7157502329916123, "grad_norm": 0.17599126216429706, "learning_rate": 1.6920952709699692e-05, "loss": 0.4902, "step": 768 }, { "epoch": 0.7166821994408201, "grad_norm": 0.19909244542476381, "learning_rate": 1.691404901622368e-05, "loss": 0.5064, "step": 769 }, { "epoch": 0.7176141658900279, "grad_norm": 0.1678734775942278, "learning_rate": 1.690714532274767e-05, "loss": 0.4824, "step": 770 }, { "epoch": 0.7185461323392358, "grad_norm": 0.17891898797302552, "learning_rate": 1.690024162927166e-05, "loss": 0.5084, "step": 771 }, { "epoch": 0.7194780987884436, "grad_norm": 0.193109212520633, "learning_rate": 1.689333793579565e-05, "loss": 0.5348, "step": 772 }, { "epoch": 0.7204100652376515, "grad_norm": 0.17054386703554963, "learning_rate": 1.6886434242319644e-05, "loss": 0.4982, "step": 773 }, { "epoch": 0.7213420316868593, "grad_norm": 0.17245450611170743, "learning_rate": 1.6879530548843633e-05, "loss": 0.5161, "step": 774 }, { "epoch": 0.7222739981360671, "grad_norm": 0.18824482220779895, "learning_rate": 1.6872626855367622e-05, "loss": 0.5023, "step": 775 }, { "epoch": 0.723205964585275, "grad_norm": 0.16994297344571313, "learning_rate": 1.6865723161891615e-05, "loss": 0.5108, "step": 776 }, { "epoch": 0.7241379310344828, "grad_norm": 0.22826952240022802, "learning_rate": 1.6858819468415604e-05, "loss": 0.4846, "step": 777 }, { "epoch": 0.7250698974836906, "grad_norm": 0.1744568030282547, "learning_rate": 1.6851915774939593e-05, "loss": 0.5105, "step": 778 }, { "epoch": 0.7260018639328985, "grad_norm": 0.18329029894036863, "learning_rate": 1.6845012081463585e-05, "loss": 0.4949, "step": 779 }, { "epoch": 0.7269338303821062, "grad_norm": 0.1573308314590483, "learning_rate": 1.6838108387987574e-05, "loss": 0.4638, "step": 780 }, { "epoch": 0.727865796831314, "grad_norm": 0.1653282056292416, "learning_rate": 1.6831204694511563e-05, "loss": 0.4997, "step": 781 }, { "epoch": 0.7287977632805219, "grad_norm": 0.1817476629107315, "learning_rate": 1.6824301001035556e-05, "loss": 0.4999, "step": 782 }, { "epoch": 0.7297297297297297, "grad_norm": 0.17067217316249442, "learning_rate": 1.6817397307559548e-05, "loss": 0.4997, "step": 783 }, { "epoch": 0.7306616961789375, "grad_norm": 0.1597421025713431, "learning_rate": 1.6810493614083537e-05, "loss": 0.5024, "step": 784 }, { "epoch": 0.7315936626281454, "grad_norm": 0.18220990821784913, "learning_rate": 1.6803589920607526e-05, "loss": 0.5025, "step": 785 }, { "epoch": 0.7325256290773532, "grad_norm": 0.18293058129147358, "learning_rate": 1.6796686227131515e-05, "loss": 0.5021, "step": 786 }, { "epoch": 0.7334575955265611, "grad_norm": 0.15930509374438998, "learning_rate": 1.6789782533655508e-05, "loss": 0.4874, "step": 787 }, { "epoch": 0.7343895619757689, "grad_norm": 0.18188789944891856, "learning_rate": 1.6782878840179497e-05, "loss": 0.5103, "step": 788 }, { "epoch": 0.7353215284249767, "grad_norm": 0.1745536225052657, "learning_rate": 1.6775975146703486e-05, "loss": 0.5248, "step": 789 }, { "epoch": 0.7362534948741846, "grad_norm": 0.17277445538979574, "learning_rate": 1.6769071453227478e-05, "loss": 0.51, "step": 790 }, { "epoch": 0.7371854613233924, "grad_norm": 0.16798759984449257, "learning_rate": 1.6762167759751467e-05, "loss": 0.5076, "step": 791 }, { "epoch": 0.7381174277726001, "grad_norm": 0.1733240680005707, "learning_rate": 1.675526406627546e-05, "loss": 0.5192, "step": 792 }, { "epoch": 0.739049394221808, "grad_norm": 0.16379560828456585, "learning_rate": 1.674836037279945e-05, "loss": 0.5048, "step": 793 }, { "epoch": 0.7399813606710158, "grad_norm": 0.19082520200117403, "learning_rate": 1.674145667932344e-05, "loss": 0.479, "step": 794 }, { "epoch": 0.7409133271202236, "grad_norm": 0.1603939500272299, "learning_rate": 1.673455298584743e-05, "loss": 0.4849, "step": 795 }, { "epoch": 0.7418452935694315, "grad_norm": 0.15442363028890918, "learning_rate": 1.672764929237142e-05, "loss": 0.484, "step": 796 }, { "epoch": 0.7427772600186393, "grad_norm": 0.16854987707789837, "learning_rate": 1.672074559889541e-05, "loss": 0.5165, "step": 797 }, { "epoch": 0.7437092264678472, "grad_norm": 0.160878825441787, "learning_rate": 1.67138419054194e-05, "loss": 0.4951, "step": 798 }, { "epoch": 0.744641192917055, "grad_norm": 0.16179569553256537, "learning_rate": 1.670693821194339e-05, "loss": 0.4974, "step": 799 }, { "epoch": 0.7455731593662628, "grad_norm": 0.15541049804822474, "learning_rate": 1.6700034518467382e-05, "loss": 0.485, "step": 800 }, { "epoch": 0.7465051258154707, "grad_norm": 0.18988045572930834, "learning_rate": 1.669313082499137e-05, "loss": 0.5018, "step": 801 }, { "epoch": 0.7474370922646785, "grad_norm": 0.16411245131645122, "learning_rate": 1.6686227131515363e-05, "loss": 0.4897, "step": 802 }, { "epoch": 0.7483690587138863, "grad_norm": 0.16196207406147, "learning_rate": 1.6679323438039353e-05, "loss": 0.5081, "step": 803 }, { "epoch": 0.7493010251630942, "grad_norm": 0.1562005681308501, "learning_rate": 1.667241974456334e-05, "loss": 0.4937, "step": 804 }, { "epoch": 0.750232991612302, "grad_norm": 0.15734687443945516, "learning_rate": 1.6665516051087334e-05, "loss": 0.4888, "step": 805 }, { "epoch": 0.7511649580615097, "grad_norm": 0.17049192623459897, "learning_rate": 1.6658612357611323e-05, "loss": 0.5183, "step": 806 }, { "epoch": 0.7520969245107176, "grad_norm": 0.15779663514690903, "learning_rate": 1.6651708664135312e-05, "loss": 0.5096, "step": 807 }, { "epoch": 0.7530288909599254, "grad_norm": 0.18933791560064653, "learning_rate": 1.6644804970659305e-05, "loss": 0.5291, "step": 808 }, { "epoch": 0.7539608574091333, "grad_norm": 0.17114994695123634, "learning_rate": 1.6637901277183294e-05, "loss": 0.5114, "step": 809 }, { "epoch": 0.7548928238583411, "grad_norm": 0.22269871863699112, "learning_rate": 1.6630997583707286e-05, "loss": 0.5305, "step": 810 }, { "epoch": 0.7558247903075489, "grad_norm": 0.17054548016209414, "learning_rate": 1.6624093890231275e-05, "loss": 0.5085, "step": 811 }, { "epoch": 0.7567567567567568, "grad_norm": 0.17746687766059327, "learning_rate": 1.6617190196755267e-05, "loss": 0.4884, "step": 812 }, { "epoch": 0.7576887232059646, "grad_norm": 0.23280639949172974, "learning_rate": 1.6610286503279256e-05, "loss": 0.5025, "step": 813 }, { "epoch": 0.7586206896551724, "grad_norm": 0.17007743786489657, "learning_rate": 1.6603382809803246e-05, "loss": 0.4992, "step": 814 }, { "epoch": 0.7595526561043803, "grad_norm": 0.1525798499884996, "learning_rate": 1.6596479116327235e-05, "loss": 0.4444, "step": 815 }, { "epoch": 0.7604846225535881, "grad_norm": 0.17774675061661072, "learning_rate": 1.6589575422851227e-05, "loss": 0.4902, "step": 816 }, { "epoch": 0.7614165890027959, "grad_norm": 0.19552487660022838, "learning_rate": 1.6582671729375216e-05, "loss": 0.52, "step": 817 }, { "epoch": 0.7623485554520038, "grad_norm": 0.17036439236387396, "learning_rate": 1.657576803589921e-05, "loss": 0.5163, "step": 818 }, { "epoch": 0.7632805219012115, "grad_norm": 0.1568658859706527, "learning_rate": 1.6568864342423197e-05, "loss": 0.483, "step": 819 }, { "epoch": 0.7642124883504194, "grad_norm": 0.17996685109126406, "learning_rate": 1.656196064894719e-05, "loss": 0.5229, "step": 820 }, { "epoch": 0.7651444547996272, "grad_norm": 0.14915062853220018, "learning_rate": 1.655505695547118e-05, "loss": 0.4773, "step": 821 }, { "epoch": 0.766076421248835, "grad_norm": 0.17441172649208753, "learning_rate": 1.6548153261995168e-05, "loss": 0.5037, "step": 822 }, { "epoch": 0.7670083876980429, "grad_norm": 0.18118428077791887, "learning_rate": 1.654124956851916e-05, "loss": 0.4802, "step": 823 }, { "epoch": 0.7679403541472507, "grad_norm": 0.17068902468932756, "learning_rate": 1.653434587504315e-05, "loss": 0.5156, "step": 824 }, { "epoch": 0.7688723205964585, "grad_norm": 0.17537668956135002, "learning_rate": 1.652744218156714e-05, "loss": 0.5025, "step": 825 }, { "epoch": 0.7698042870456664, "grad_norm": 0.16966584927397504, "learning_rate": 1.6520538488091128e-05, "loss": 0.4894, "step": 826 }, { "epoch": 0.7707362534948742, "grad_norm": 0.16636954873870013, "learning_rate": 1.651363479461512e-05, "loss": 0.4677, "step": 827 }, { "epoch": 0.771668219944082, "grad_norm": 0.18418824431614347, "learning_rate": 1.6506731101139112e-05, "loss": 0.5046, "step": 828 }, { "epoch": 0.7726001863932899, "grad_norm": 0.15662675796050787, "learning_rate": 1.64998274076631e-05, "loss": 0.4901, "step": 829 }, { "epoch": 0.7735321528424977, "grad_norm": 0.17216374222960326, "learning_rate": 1.6492923714187094e-05, "loss": 0.4993, "step": 830 }, { "epoch": 0.7744641192917054, "grad_norm": 0.1895423189828854, "learning_rate": 1.6486020020711083e-05, "loss": 0.5165, "step": 831 }, { "epoch": 0.7753960857409133, "grad_norm": 0.15977059475561395, "learning_rate": 1.6479116327235072e-05, "loss": 0.4918, "step": 832 }, { "epoch": 0.7763280521901211, "grad_norm": 0.16422122297075314, "learning_rate": 1.647221263375906e-05, "loss": 0.4726, "step": 833 }, { "epoch": 0.777260018639329, "grad_norm": 0.17076482080873587, "learning_rate": 1.6465308940283053e-05, "loss": 0.4971, "step": 834 }, { "epoch": 0.7781919850885368, "grad_norm": 0.16283995625131695, "learning_rate": 1.6458405246807042e-05, "loss": 0.5145, "step": 835 }, { "epoch": 0.7791239515377446, "grad_norm": 0.17538760362475855, "learning_rate": 1.645150155333103e-05, "loss": 0.4881, "step": 836 }, { "epoch": 0.7800559179869525, "grad_norm": 0.1727263740870423, "learning_rate": 1.6444597859855024e-05, "loss": 0.4933, "step": 837 }, { "epoch": 0.7809878844361603, "grad_norm": 0.18130666474681364, "learning_rate": 1.6437694166379013e-05, "loss": 0.512, "step": 838 }, { "epoch": 0.7819198508853681, "grad_norm": 0.16423500253582882, "learning_rate": 1.6430790472903005e-05, "loss": 0.4835, "step": 839 }, { "epoch": 0.782851817334576, "grad_norm": 0.18120558385093366, "learning_rate": 1.6423886779426994e-05, "loss": 0.5381, "step": 840 }, { "epoch": 0.7837837837837838, "grad_norm": 0.1687583286072166, "learning_rate": 1.6416983085950987e-05, "loss": 0.5274, "step": 841 }, { "epoch": 0.7847157502329916, "grad_norm": 0.23135891627355815, "learning_rate": 1.6410079392474976e-05, "loss": 0.5105, "step": 842 }, { "epoch": 0.7856477166821995, "grad_norm": 0.17755514594282645, "learning_rate": 1.6403175698998965e-05, "loss": 0.5119, "step": 843 }, { "epoch": 0.7865796831314072, "grad_norm": 0.18690962442200693, "learning_rate": 1.6396272005522954e-05, "loss": 0.51, "step": 844 }, { "epoch": 0.7875116495806151, "grad_norm": 0.18226796295007774, "learning_rate": 1.6389368312046946e-05, "loss": 0.5087, "step": 845 }, { "epoch": 0.7884436160298229, "grad_norm": 0.18101943002701384, "learning_rate": 1.6382464618570935e-05, "loss": 0.521, "step": 846 }, { "epoch": 0.7893755824790307, "grad_norm": 0.15403681768935717, "learning_rate": 1.6375560925094928e-05, "loss": 0.4874, "step": 847 }, { "epoch": 0.7903075489282386, "grad_norm": 0.1886027328288787, "learning_rate": 1.6368657231618917e-05, "loss": 0.4979, "step": 848 }, { "epoch": 0.7912395153774464, "grad_norm": 0.179799490694495, "learning_rate": 1.636175353814291e-05, "loss": 0.4819, "step": 849 }, { "epoch": 0.7921714818266542, "grad_norm": 0.17584942681685697, "learning_rate": 1.63548498446669e-05, "loss": 0.5106, "step": 850 }, { "epoch": 0.7931034482758621, "grad_norm": 0.20701293473589372, "learning_rate": 1.6347946151190887e-05, "loss": 0.5323, "step": 851 }, { "epoch": 0.7940354147250699, "grad_norm": 0.1861735441469847, "learning_rate": 1.634104245771488e-05, "loss": 0.4933, "step": 852 }, { "epoch": 0.7949673811742777, "grad_norm": 0.18038389376399436, "learning_rate": 1.633413876423887e-05, "loss": 0.4792, "step": 853 }, { "epoch": 0.7958993476234856, "grad_norm": 0.16817056739787808, "learning_rate": 1.6327235070762858e-05, "loss": 0.4937, "step": 854 }, { "epoch": 0.7968313140726934, "grad_norm": 0.22085239686400907, "learning_rate": 1.632033137728685e-05, "loss": 0.5191, "step": 855 }, { "epoch": 0.7977632805219013, "grad_norm": 0.2301000307549076, "learning_rate": 1.631342768381084e-05, "loss": 0.5201, "step": 856 }, { "epoch": 0.798695246971109, "grad_norm": 0.16369079838461986, "learning_rate": 1.6306523990334832e-05, "loss": 0.4977, "step": 857 }, { "epoch": 0.7996272134203168, "grad_norm": 0.15646512025354103, "learning_rate": 1.629962029685882e-05, "loss": 0.488, "step": 858 }, { "epoch": 0.8005591798695247, "grad_norm": 0.1848334161378123, "learning_rate": 1.6292716603382813e-05, "loss": 0.561, "step": 859 }, { "epoch": 0.8014911463187325, "grad_norm": 0.14944277163749328, "learning_rate": 1.6285812909906802e-05, "loss": 0.4952, "step": 860 }, { "epoch": 0.8024231127679403, "grad_norm": 0.1633450974348689, "learning_rate": 1.627890921643079e-05, "loss": 0.4722, "step": 861 }, { "epoch": 0.8033550792171482, "grad_norm": 0.1585142267025842, "learning_rate": 1.627200552295478e-05, "loss": 0.516, "step": 862 }, { "epoch": 0.804287045666356, "grad_norm": 0.15577326423594043, "learning_rate": 1.6265101829478773e-05, "loss": 0.5049, "step": 863 }, { "epoch": 0.8052190121155638, "grad_norm": 0.15344668919710405, "learning_rate": 1.6258198136002762e-05, "loss": 0.4495, "step": 864 }, { "epoch": 0.8061509785647717, "grad_norm": 0.16709295626467888, "learning_rate": 1.6251294442526754e-05, "loss": 0.4914, "step": 865 }, { "epoch": 0.8070829450139795, "grad_norm": 0.20004487765099227, "learning_rate": 1.6244390749050743e-05, "loss": 0.5047, "step": 866 }, { "epoch": 0.8080149114631874, "grad_norm": 0.15995999167376249, "learning_rate": 1.6237487055574736e-05, "loss": 0.4877, "step": 867 }, { "epoch": 0.8089468779123952, "grad_norm": 0.16941417449463247, "learning_rate": 1.6230583362098725e-05, "loss": 0.5205, "step": 868 }, { "epoch": 0.809878844361603, "grad_norm": 0.21492859276211165, "learning_rate": 1.6223679668622714e-05, "loss": 0.5143, "step": 869 }, { "epoch": 0.8108108108108109, "grad_norm": 0.17945260870721383, "learning_rate": 1.6216775975146706e-05, "loss": 0.502, "step": 870 }, { "epoch": 0.8117427772600186, "grad_norm": 0.18969248938470315, "learning_rate": 1.6209872281670695e-05, "loss": 0.5201, "step": 871 }, { "epoch": 0.8126747437092264, "grad_norm": 0.15843629337513448, "learning_rate": 1.6202968588194684e-05, "loss": 0.4948, "step": 872 }, { "epoch": 0.8136067101584343, "grad_norm": 0.15479867308826747, "learning_rate": 1.6196064894718673e-05, "loss": 0.4786, "step": 873 }, { "epoch": 0.8145386766076421, "grad_norm": 0.17503288071473747, "learning_rate": 1.6189161201242666e-05, "loss": 0.5067, "step": 874 }, { "epoch": 0.8154706430568499, "grad_norm": 0.15358630644859744, "learning_rate": 1.6182257507766658e-05, "loss": 0.4966, "step": 875 }, { "epoch": 0.8164026095060578, "grad_norm": 0.1503973089552752, "learning_rate": 1.6175353814290647e-05, "loss": 0.4938, "step": 876 }, { "epoch": 0.8173345759552656, "grad_norm": 0.1759524786650609, "learning_rate": 1.6168450120814636e-05, "loss": 0.5134, "step": 877 }, { "epoch": 0.8182665424044734, "grad_norm": 0.16015026970211405, "learning_rate": 1.616154642733863e-05, "loss": 0.4614, "step": 878 }, { "epoch": 0.8191985088536813, "grad_norm": 0.15430970847375025, "learning_rate": 1.6154642733862618e-05, "loss": 0.4936, "step": 879 }, { "epoch": 0.8201304753028891, "grad_norm": 0.17005816127549475, "learning_rate": 1.6147739040386607e-05, "loss": 0.4755, "step": 880 }, { "epoch": 0.821062441752097, "grad_norm": 0.16869297434148292, "learning_rate": 1.61408353469106e-05, "loss": 0.5072, "step": 881 }, { "epoch": 0.8219944082013048, "grad_norm": 0.1916506685278581, "learning_rate": 1.6133931653434588e-05, "loss": 0.502, "step": 882 }, { "epoch": 0.8229263746505125, "grad_norm": 0.16786250087366963, "learning_rate": 1.6127027959958577e-05, "loss": 0.5191, "step": 883 }, { "epoch": 0.8238583410997204, "grad_norm": 0.15714656362937138, "learning_rate": 1.612012426648257e-05, "loss": 0.5031, "step": 884 }, { "epoch": 0.8247903075489282, "grad_norm": 0.17083364817856433, "learning_rate": 1.6113220573006562e-05, "loss": 0.5004, "step": 885 }, { "epoch": 0.825722273998136, "grad_norm": 0.16819721901114193, "learning_rate": 1.610631687953055e-05, "loss": 0.4852, "step": 886 }, { "epoch": 0.8266542404473439, "grad_norm": 0.16374200884888562, "learning_rate": 1.609941318605454e-05, "loss": 0.4852, "step": 887 }, { "epoch": 0.8275862068965517, "grad_norm": 0.15419158283567116, "learning_rate": 1.6092509492578533e-05, "loss": 0.4886, "step": 888 }, { "epoch": 0.8285181733457595, "grad_norm": 0.16654782600106094, "learning_rate": 1.608560579910252e-05, "loss": 0.4877, "step": 889 }, { "epoch": 0.8294501397949674, "grad_norm": 0.168535152102799, "learning_rate": 1.607870210562651e-05, "loss": 0.5048, "step": 890 }, { "epoch": 0.8303821062441752, "grad_norm": 0.17449615328687615, "learning_rate": 1.60717984121505e-05, "loss": 0.5145, "step": 891 }, { "epoch": 0.8313140726933831, "grad_norm": 0.17775259624836984, "learning_rate": 1.6064894718674492e-05, "loss": 0.5579, "step": 892 }, { "epoch": 0.8322460391425909, "grad_norm": 0.16786640001430436, "learning_rate": 1.605799102519848e-05, "loss": 0.4926, "step": 893 }, { "epoch": 0.8331780055917987, "grad_norm": 0.18184531278153768, "learning_rate": 1.6051087331722474e-05, "loss": 0.4966, "step": 894 }, { "epoch": 0.8341099720410066, "grad_norm": 0.15583895647909482, "learning_rate": 1.6044183638246463e-05, "loss": 0.5107, "step": 895 }, { "epoch": 0.8350419384902144, "grad_norm": 0.1519756407758854, "learning_rate": 1.6037279944770455e-05, "loss": 0.5029, "step": 896 }, { "epoch": 0.8359739049394221, "grad_norm": 0.17854789891871953, "learning_rate": 1.6030376251294444e-05, "loss": 0.4735, "step": 897 }, { "epoch": 0.83690587138863, "grad_norm": 0.16200274414708174, "learning_rate": 1.6023472557818433e-05, "loss": 0.512, "step": 898 }, { "epoch": 0.8378378378378378, "grad_norm": 0.1752274173990056, "learning_rate": 1.6016568864342426e-05, "loss": 0.4863, "step": 899 }, { "epoch": 0.8387698042870456, "grad_norm": 0.16369970104744797, "learning_rate": 1.6009665170866415e-05, "loss": 0.4909, "step": 900 }, { "epoch": 0.8397017707362535, "grad_norm": 0.16257173132114508, "learning_rate": 1.6002761477390404e-05, "loss": 0.5118, "step": 901 }, { "epoch": 0.8406337371854613, "grad_norm": 0.16082973845422543, "learning_rate": 1.5995857783914396e-05, "loss": 0.4904, "step": 902 }, { "epoch": 0.8415657036346692, "grad_norm": 0.17584427489224622, "learning_rate": 1.5988954090438385e-05, "loss": 0.5031, "step": 903 }, { "epoch": 0.842497670083877, "grad_norm": 0.17820617133557093, "learning_rate": 1.5982050396962378e-05, "loss": 0.4968, "step": 904 }, { "epoch": 0.8434296365330848, "grad_norm": 0.15852419824253763, "learning_rate": 1.5975146703486367e-05, "loss": 0.4673, "step": 905 }, { "epoch": 0.8443616029822927, "grad_norm": 0.1836204139106037, "learning_rate": 1.5968243010010356e-05, "loss": 0.5285, "step": 906 }, { "epoch": 0.8452935694315005, "grad_norm": 0.184021451511235, "learning_rate": 1.5961339316534348e-05, "loss": 0.5216, "step": 907 }, { "epoch": 0.8462255358807083, "grad_norm": 0.17606969442744524, "learning_rate": 1.5954435623058337e-05, "loss": 0.4935, "step": 908 }, { "epoch": 0.8471575023299162, "grad_norm": 0.16833739845280474, "learning_rate": 1.5947531929582326e-05, "loss": 0.4747, "step": 909 }, { "epoch": 0.848089468779124, "grad_norm": 0.2222186095445494, "learning_rate": 1.594062823610632e-05, "loss": 0.496, "step": 910 }, { "epoch": 0.8490214352283317, "grad_norm": 0.1843407345438744, "learning_rate": 1.5933724542630308e-05, "loss": 0.5099, "step": 911 }, { "epoch": 0.8499534016775396, "grad_norm": 0.15492491314080276, "learning_rate": 1.59268208491543e-05, "loss": 0.4635, "step": 912 }, { "epoch": 0.8508853681267474, "grad_norm": 0.15412884786113412, "learning_rate": 1.591991715567829e-05, "loss": 0.4915, "step": 913 }, { "epoch": 0.8518173345759553, "grad_norm": 0.17717080674201605, "learning_rate": 1.591301346220228e-05, "loss": 0.4855, "step": 914 }, { "epoch": 0.8527493010251631, "grad_norm": 0.1723817123023834, "learning_rate": 1.590610976872627e-05, "loss": 0.5051, "step": 915 }, { "epoch": 0.8536812674743709, "grad_norm": 0.15665645826048102, "learning_rate": 1.589920607525026e-05, "loss": 0.4943, "step": 916 }, { "epoch": 0.8546132339235788, "grad_norm": 0.1664975738225392, "learning_rate": 1.589230238177425e-05, "loss": 0.5074, "step": 917 }, { "epoch": 0.8555452003727866, "grad_norm": 0.21750415726060732, "learning_rate": 1.588539868829824e-05, "loss": 0.4882, "step": 918 }, { "epoch": 0.8564771668219944, "grad_norm": 0.17402590421657765, "learning_rate": 1.587849499482223e-05, "loss": 0.5101, "step": 919 }, { "epoch": 0.8574091332712023, "grad_norm": 0.17885161115321477, "learning_rate": 1.5871591301346222e-05, "loss": 0.5092, "step": 920 }, { "epoch": 0.8583410997204101, "grad_norm": 0.1552245670267141, "learning_rate": 1.586468760787021e-05, "loss": 0.4674, "step": 921 }, { "epoch": 0.8592730661696178, "grad_norm": 0.16488521045245763, "learning_rate": 1.5857783914394204e-05, "loss": 0.4751, "step": 922 }, { "epoch": 0.8602050326188257, "grad_norm": 0.1746424250445681, "learning_rate": 1.5850880220918193e-05, "loss": 0.5016, "step": 923 }, { "epoch": 0.8611369990680335, "grad_norm": 0.15327681891872677, "learning_rate": 1.5843976527442182e-05, "loss": 0.4895, "step": 924 }, { "epoch": 0.8620689655172413, "grad_norm": 0.16597368661942274, "learning_rate": 1.5837072833966174e-05, "loss": 0.4951, "step": 925 }, { "epoch": 0.8630009319664492, "grad_norm": 0.15346901026827578, "learning_rate": 1.5830169140490164e-05, "loss": 0.4743, "step": 926 }, { "epoch": 0.863932898415657, "grad_norm": 0.1592890186195035, "learning_rate": 1.5823265447014153e-05, "loss": 0.4985, "step": 927 }, { "epoch": 0.8648648648648649, "grad_norm": 0.1717675631423993, "learning_rate": 1.5816361753538145e-05, "loss": 0.4781, "step": 928 }, { "epoch": 0.8657968313140727, "grad_norm": 0.17647294001025723, "learning_rate": 1.5809458060062134e-05, "loss": 0.4792, "step": 929 }, { "epoch": 0.8667287977632805, "grad_norm": 0.21577891839610935, "learning_rate": 1.5802554366586126e-05, "loss": 0.4787, "step": 930 }, { "epoch": 0.8676607642124884, "grad_norm": 0.1918344237716205, "learning_rate": 1.5795650673110115e-05, "loss": 0.4982, "step": 931 }, { "epoch": 0.8685927306616962, "grad_norm": 0.19410636332183703, "learning_rate": 1.5788746979634108e-05, "loss": 0.4966, "step": 932 }, { "epoch": 0.869524697110904, "grad_norm": 0.21518387866844516, "learning_rate": 1.5781843286158097e-05, "loss": 0.5148, "step": 933 }, { "epoch": 0.8704566635601119, "grad_norm": 0.1964693853373881, "learning_rate": 1.5774939592682086e-05, "loss": 0.4835, "step": 934 }, { "epoch": 0.8713886300093197, "grad_norm": 0.23623007086088885, "learning_rate": 1.5768035899206075e-05, "loss": 0.5017, "step": 935 }, { "epoch": 0.8723205964585274, "grad_norm": 0.1645648463867194, "learning_rate": 1.5761132205730067e-05, "loss": 0.5038, "step": 936 }, { "epoch": 0.8732525629077353, "grad_norm": 0.1654812498150654, "learning_rate": 1.5754228512254056e-05, "loss": 0.4871, "step": 937 }, { "epoch": 0.8741845293569431, "grad_norm": 0.17167168002589459, "learning_rate": 1.5747324818778046e-05, "loss": 0.4781, "step": 938 }, { "epoch": 0.875116495806151, "grad_norm": 0.18959940511098713, "learning_rate": 1.5740421125302038e-05, "loss": 0.4933, "step": 939 }, { "epoch": 0.8760484622553588, "grad_norm": 0.154910610300903, "learning_rate": 1.5733517431826027e-05, "loss": 0.5018, "step": 940 }, { "epoch": 0.8769804287045666, "grad_norm": 0.15347593281403038, "learning_rate": 1.572661373835002e-05, "loss": 0.4766, "step": 941 }, { "epoch": 0.8779123951537745, "grad_norm": 0.19266823815622358, "learning_rate": 1.571971004487401e-05, "loss": 0.5009, "step": 942 }, { "epoch": 0.8788443616029823, "grad_norm": 0.1858307110005265, "learning_rate": 1.5712806351398e-05, "loss": 0.497, "step": 943 }, { "epoch": 0.8797763280521901, "grad_norm": 0.16931802941375043, "learning_rate": 1.570590265792199e-05, "loss": 0.5293, "step": 944 }, { "epoch": 0.880708294501398, "grad_norm": 0.17223892368951915, "learning_rate": 1.569899896444598e-05, "loss": 0.487, "step": 945 }, { "epoch": 0.8816402609506058, "grad_norm": 0.18008235079010385, "learning_rate": 1.5692095270969968e-05, "loss": 0.5099, "step": 946 }, { "epoch": 0.8825722273998136, "grad_norm": 0.1791301326895336, "learning_rate": 1.568519157749396e-05, "loss": 0.5051, "step": 947 }, { "epoch": 0.8835041938490215, "grad_norm": 0.16123380331948617, "learning_rate": 1.567828788401795e-05, "loss": 0.4956, "step": 948 }, { "epoch": 0.8844361602982292, "grad_norm": 0.17193262480656893, "learning_rate": 1.5671384190541942e-05, "loss": 0.5194, "step": 949 }, { "epoch": 0.8853681267474371, "grad_norm": 0.1580127483710158, "learning_rate": 1.566448049706593e-05, "loss": 0.4948, "step": 950 }, { "epoch": 0.8863000931966449, "grad_norm": 0.16320246708253766, "learning_rate": 1.5657576803589923e-05, "loss": 0.4881, "step": 951 }, { "epoch": 0.8872320596458527, "grad_norm": 0.15450437284014934, "learning_rate": 1.5650673110113912e-05, "loss": 0.478, "step": 952 }, { "epoch": 0.8881640260950606, "grad_norm": 0.22664991863275505, "learning_rate": 1.56437694166379e-05, "loss": 0.5429, "step": 953 }, { "epoch": 0.8890959925442684, "grad_norm": 0.17183387882862025, "learning_rate": 1.5636865723161894e-05, "loss": 0.5036, "step": 954 }, { "epoch": 0.8900279589934762, "grad_norm": 0.16463861475823738, "learning_rate": 1.5629962029685883e-05, "loss": 0.5255, "step": 955 }, { "epoch": 0.8909599254426841, "grad_norm": 0.19199040042341928, "learning_rate": 1.5623058336209872e-05, "loss": 0.5033, "step": 956 }, { "epoch": 0.8918918918918919, "grad_norm": 0.16722363402612606, "learning_rate": 1.5616154642733864e-05, "loss": 0.533, "step": 957 }, { "epoch": 0.8928238583410997, "grad_norm": 0.1791030113441324, "learning_rate": 1.5609250949257853e-05, "loss": 0.4956, "step": 958 }, { "epoch": 0.8937558247903076, "grad_norm": 0.15805587798013906, "learning_rate": 1.5602347255781846e-05, "loss": 0.4675, "step": 959 }, { "epoch": 0.8946877912395154, "grad_norm": 0.16688526915338095, "learning_rate": 1.5595443562305835e-05, "loss": 0.4893, "step": 960 }, { "epoch": 0.8956197576887233, "grad_norm": 0.29506568332831334, "learning_rate": 1.5588539868829827e-05, "loss": 0.5102, "step": 961 }, { "epoch": 0.896551724137931, "grad_norm": 0.15824077284701793, "learning_rate": 1.5581636175353816e-05, "loss": 0.4725, "step": 962 }, { "epoch": 0.8974836905871388, "grad_norm": 0.18389791331341215, "learning_rate": 1.5574732481877805e-05, "loss": 0.4963, "step": 963 }, { "epoch": 0.8984156570363467, "grad_norm": 0.17619425633955044, "learning_rate": 1.5567828788401794e-05, "loss": 0.4657, "step": 964 }, { "epoch": 0.8993476234855545, "grad_norm": 0.2077472076193503, "learning_rate": 1.5560925094925787e-05, "loss": 0.5017, "step": 965 }, { "epoch": 0.9002795899347623, "grad_norm": 0.1727372015514035, "learning_rate": 1.5554021401449776e-05, "loss": 0.484, "step": 966 }, { "epoch": 0.9012115563839702, "grad_norm": 0.17661921539004674, "learning_rate": 1.5547117707973768e-05, "loss": 0.4968, "step": 967 }, { "epoch": 0.902143522833178, "grad_norm": 0.17379613088157828, "learning_rate": 1.5540214014497757e-05, "loss": 0.4875, "step": 968 }, { "epoch": 0.9030754892823858, "grad_norm": 0.16192107480663268, "learning_rate": 1.553331032102175e-05, "loss": 0.4915, "step": 969 }, { "epoch": 0.9040074557315937, "grad_norm": 0.1762233954963268, "learning_rate": 1.552640662754574e-05, "loss": 0.4665, "step": 970 }, { "epoch": 0.9049394221808015, "grad_norm": 0.17071909941976174, "learning_rate": 1.5519502934069728e-05, "loss": 0.4917, "step": 971 }, { "epoch": 0.9058713886300093, "grad_norm": 0.1619499235719941, "learning_rate": 1.551259924059372e-05, "loss": 0.5017, "step": 972 }, { "epoch": 0.9068033550792172, "grad_norm": 0.18407049908071327, "learning_rate": 1.550569554711771e-05, "loss": 0.5237, "step": 973 }, { "epoch": 0.907735321528425, "grad_norm": 0.17046796578081852, "learning_rate": 1.54987918536417e-05, "loss": 0.5163, "step": 974 }, { "epoch": 0.9086672879776329, "grad_norm": 0.17232072605583054, "learning_rate": 1.5491888160165687e-05, "loss": 0.5082, "step": 975 }, { "epoch": 0.9095992544268406, "grad_norm": 0.15317284569236708, "learning_rate": 1.548498446668968e-05, "loss": 0.4808, "step": 976 }, { "epoch": 0.9105312208760484, "grad_norm": 0.17726778224589287, "learning_rate": 1.5478080773213672e-05, "loss": 0.491, "step": 977 }, { "epoch": 0.9114631873252563, "grad_norm": 0.1822181168683729, "learning_rate": 1.547117707973766e-05, "loss": 0.4847, "step": 978 }, { "epoch": 0.9123951537744641, "grad_norm": 0.1647814317048035, "learning_rate": 1.5464273386261654e-05, "loss": 0.4983, "step": 979 }, { "epoch": 0.9133271202236719, "grad_norm": 0.19757283936272868, "learning_rate": 1.5457369692785643e-05, "loss": 0.5141, "step": 980 }, { "epoch": 0.9142590866728798, "grad_norm": 0.170019509624183, "learning_rate": 1.5450465999309632e-05, "loss": 0.4914, "step": 981 }, { "epoch": 0.9151910531220876, "grad_norm": 0.17797829934828954, "learning_rate": 1.544356230583362e-05, "loss": 0.4773, "step": 982 }, { "epoch": 0.9161230195712954, "grad_norm": 0.16121023732257728, "learning_rate": 1.5436658612357613e-05, "loss": 0.501, "step": 983 }, { "epoch": 0.9170549860205033, "grad_norm": 0.18373534219853005, "learning_rate": 1.5429754918881602e-05, "loss": 0.4915, "step": 984 }, { "epoch": 0.9179869524697111, "grad_norm": 0.1736940213777265, "learning_rate": 1.542285122540559e-05, "loss": 0.5101, "step": 985 }, { "epoch": 0.918918918918919, "grad_norm": 0.1581325566425397, "learning_rate": 1.5415947531929584e-05, "loss": 0.4737, "step": 986 }, { "epoch": 0.9198508853681268, "grad_norm": 0.1805769186320522, "learning_rate": 1.5409043838453576e-05, "loss": 0.4974, "step": 987 }, { "epoch": 0.9207828518173345, "grad_norm": 0.16227069223629229, "learning_rate": 1.5402140144977565e-05, "loss": 0.5157, "step": 988 }, { "epoch": 0.9217148182665424, "grad_norm": 0.17666282212204265, "learning_rate": 1.5395236451501554e-05, "loss": 0.4758, "step": 989 }, { "epoch": 0.9226467847157502, "grad_norm": 0.18188525454217483, "learning_rate": 1.5388332758025547e-05, "loss": 0.4958, "step": 990 }, { "epoch": 0.923578751164958, "grad_norm": 0.17357444529400928, "learning_rate": 1.5381429064549536e-05, "loss": 0.4794, "step": 991 }, { "epoch": 0.9245107176141659, "grad_norm": 0.16746895808652862, "learning_rate": 1.5374525371073525e-05, "loss": 0.4865, "step": 992 }, { "epoch": 0.9254426840633737, "grad_norm": 0.15014207564165358, "learning_rate": 1.5367621677597514e-05, "loss": 0.456, "step": 993 }, { "epoch": 0.9263746505125815, "grad_norm": 0.21259699448371866, "learning_rate": 1.5360717984121506e-05, "loss": 0.5241, "step": 994 }, { "epoch": 0.9273066169617894, "grad_norm": 0.15898571752674343, "learning_rate": 1.5353814290645495e-05, "loss": 0.4854, "step": 995 }, { "epoch": 0.9282385834109972, "grad_norm": 0.18682987317188812, "learning_rate": 1.5346910597169488e-05, "loss": 0.5006, "step": 996 }, { "epoch": 0.9291705498602051, "grad_norm": 0.17389598691948024, "learning_rate": 1.5340006903693477e-05, "loss": 0.4819, "step": 997 }, { "epoch": 0.9301025163094129, "grad_norm": 0.15789447361009107, "learning_rate": 1.533310321021747e-05, "loss": 0.4827, "step": 998 }, { "epoch": 0.9310344827586207, "grad_norm": 0.17712666599085813, "learning_rate": 1.5326199516741458e-05, "loss": 0.4975, "step": 999 }, { "epoch": 0.9319664492078286, "grad_norm": 0.18742524076133502, "learning_rate": 1.5319295823265447e-05, "loss": 0.4892, "step": 1000 }, { "epoch": 0.9328984156570364, "grad_norm": 0.15632704493043525, "learning_rate": 1.531239212978944e-05, "loss": 0.4795, "step": 1001 }, { "epoch": 0.9338303821062441, "grad_norm": 0.1667720331475852, "learning_rate": 1.530548843631343e-05, "loss": 0.4975, "step": 1002 }, { "epoch": 0.934762348555452, "grad_norm": 0.17554626234523216, "learning_rate": 1.5298584742837418e-05, "loss": 0.495, "step": 1003 }, { "epoch": 0.9356943150046598, "grad_norm": 0.19345434508779147, "learning_rate": 1.529168104936141e-05, "loss": 0.5051, "step": 1004 }, { "epoch": 0.9366262814538676, "grad_norm": 0.16153232741669535, "learning_rate": 1.52847773558854e-05, "loss": 0.5002, "step": 1005 }, { "epoch": 0.9375582479030755, "grad_norm": 0.17648391283981082, "learning_rate": 1.527787366240939e-05, "loss": 0.4966, "step": 1006 }, { "epoch": 0.9384902143522833, "grad_norm": 0.18143972589982368, "learning_rate": 1.527096996893338e-05, "loss": 0.4838, "step": 1007 }, { "epoch": 0.9394221808014911, "grad_norm": 0.16510218251623035, "learning_rate": 1.526406627545737e-05, "loss": 0.4564, "step": 1008 }, { "epoch": 0.940354147250699, "grad_norm": 0.19171333555684866, "learning_rate": 1.5257162581981362e-05, "loss": 0.5134, "step": 1009 }, { "epoch": 0.9412861136999068, "grad_norm": 0.15599924892010741, "learning_rate": 1.5250258888505351e-05, "loss": 0.4951, "step": 1010 }, { "epoch": 0.9422180801491147, "grad_norm": 0.18703610793659578, "learning_rate": 1.5243355195029342e-05, "loss": 0.5108, "step": 1011 }, { "epoch": 0.9431500465983225, "grad_norm": 0.1721542214825951, "learning_rate": 1.5236451501553333e-05, "loss": 0.4986, "step": 1012 }, { "epoch": 0.9440820130475303, "grad_norm": 0.18672295308067718, "learning_rate": 1.5229547808077323e-05, "loss": 0.4876, "step": 1013 }, { "epoch": 0.9450139794967382, "grad_norm": 0.1613506317308374, "learning_rate": 1.5222644114601312e-05, "loss": 0.4835, "step": 1014 }, { "epoch": 0.9459459459459459, "grad_norm": 0.19683057008688465, "learning_rate": 1.5215740421125303e-05, "loss": 0.5113, "step": 1015 }, { "epoch": 0.9468779123951537, "grad_norm": 0.1635176271577811, "learning_rate": 1.5208836727649294e-05, "loss": 0.4875, "step": 1016 }, { "epoch": 0.9478098788443616, "grad_norm": 0.16916823104166762, "learning_rate": 1.5201933034173285e-05, "loss": 0.4816, "step": 1017 }, { "epoch": 0.9487418452935694, "grad_norm": 0.17276254749840417, "learning_rate": 1.5195029340697274e-05, "loss": 0.5035, "step": 1018 }, { "epoch": 0.9496738117427772, "grad_norm": 0.2003825675941556, "learning_rate": 1.5188125647221266e-05, "loss": 0.5035, "step": 1019 }, { "epoch": 0.9506057781919851, "grad_norm": 0.1705005676366751, "learning_rate": 1.5181221953745255e-05, "loss": 0.4743, "step": 1020 }, { "epoch": 0.9515377446411929, "grad_norm": 0.17342354086209583, "learning_rate": 1.5174318260269246e-05, "loss": 0.4954, "step": 1021 }, { "epoch": 0.9524697110904008, "grad_norm": 0.1731479073742357, "learning_rate": 1.5167414566793235e-05, "loss": 0.4974, "step": 1022 }, { "epoch": 0.9534016775396086, "grad_norm": 0.1734896826825382, "learning_rate": 1.5160510873317227e-05, "loss": 0.4823, "step": 1023 }, { "epoch": 0.9543336439888164, "grad_norm": 0.1770315415766613, "learning_rate": 1.5153607179841216e-05, "loss": 0.4969, "step": 1024 }, { "epoch": 0.9552656104380243, "grad_norm": 0.17700015576262967, "learning_rate": 1.5146703486365207e-05, "loss": 0.5022, "step": 1025 }, { "epoch": 0.9561975768872321, "grad_norm": 0.17059680632079674, "learning_rate": 1.5139799792889196e-05, "loss": 0.4953, "step": 1026 }, { "epoch": 0.9571295433364398, "grad_norm": 0.2050071774969081, "learning_rate": 1.5132896099413189e-05, "loss": 0.5, "step": 1027 }, { "epoch": 0.9580615097856477, "grad_norm": 0.23339078120459059, "learning_rate": 1.5125992405937178e-05, "loss": 0.5398, "step": 1028 }, { "epoch": 0.9589934762348555, "grad_norm": 0.16891106938017797, "learning_rate": 1.5119088712461167e-05, "loss": 0.5008, "step": 1029 }, { "epoch": 0.9599254426840633, "grad_norm": 0.15821721391076182, "learning_rate": 1.5112185018985159e-05, "loss": 0.4835, "step": 1030 }, { "epoch": 0.9608574091332712, "grad_norm": 0.17668127682669063, "learning_rate": 1.510528132550915e-05, "loss": 0.5185, "step": 1031 }, { "epoch": 0.961789375582479, "grad_norm": 0.1554513836275494, "learning_rate": 1.5098377632033139e-05, "loss": 0.48, "step": 1032 }, { "epoch": 0.9627213420316869, "grad_norm": 0.1551682831876922, "learning_rate": 1.5091473938557128e-05, "loss": 0.5015, "step": 1033 }, { "epoch": 0.9636533084808947, "grad_norm": 0.15725366399716578, "learning_rate": 1.508457024508112e-05, "loss": 0.4651, "step": 1034 }, { "epoch": 0.9645852749301025, "grad_norm": 0.15873530688485796, "learning_rate": 1.5077666551605111e-05, "loss": 0.487, "step": 1035 }, { "epoch": 0.9655172413793104, "grad_norm": 0.1681195755386051, "learning_rate": 1.50707628581291e-05, "loss": 0.4773, "step": 1036 }, { "epoch": 0.9664492078285182, "grad_norm": 0.16421516318730917, "learning_rate": 1.5063859164653089e-05, "loss": 0.491, "step": 1037 }, { "epoch": 0.967381174277726, "grad_norm": 0.15756466088368906, "learning_rate": 1.5056955471177081e-05, "loss": 0.4881, "step": 1038 }, { "epoch": 0.9683131407269339, "grad_norm": 0.16076351336173722, "learning_rate": 1.505005177770107e-05, "loss": 0.5244, "step": 1039 }, { "epoch": 0.9692451071761417, "grad_norm": 0.16569430868481608, "learning_rate": 1.5043148084225061e-05, "loss": 0.4643, "step": 1040 }, { "epoch": 0.9701770736253494, "grad_norm": 0.1655321277190243, "learning_rate": 1.5036244390749054e-05, "loss": 0.4775, "step": 1041 }, { "epoch": 0.9711090400745573, "grad_norm": 0.14710450054412605, "learning_rate": 1.5029340697273043e-05, "loss": 0.4844, "step": 1042 }, { "epoch": 0.9720410065237651, "grad_norm": 0.1508165808985877, "learning_rate": 1.5022437003797032e-05, "loss": 0.4662, "step": 1043 }, { "epoch": 0.972972972972973, "grad_norm": 0.15373804038740876, "learning_rate": 1.5015533310321023e-05, "loss": 0.4716, "step": 1044 }, { "epoch": 0.9739049394221808, "grad_norm": 0.16409028474982257, "learning_rate": 1.5008629616845013e-05, "loss": 0.4913, "step": 1045 }, { "epoch": 0.9748369058713886, "grad_norm": 0.1681708519935129, "learning_rate": 1.5001725923369004e-05, "loss": 0.5228, "step": 1046 }, { "epoch": 0.9757688723205965, "grad_norm": 0.173214041221056, "learning_rate": 1.4994822229892993e-05, "loss": 0.5323, "step": 1047 }, { "epoch": 0.9767008387698043, "grad_norm": 0.14855318449061172, "learning_rate": 1.4987918536416984e-05, "loss": 0.4842, "step": 1048 }, { "epoch": 0.9776328052190121, "grad_norm": 0.17414254437356053, "learning_rate": 1.4981014842940974e-05, "loss": 0.5128, "step": 1049 }, { "epoch": 0.97856477166822, "grad_norm": 0.1871367454187071, "learning_rate": 1.4974111149464965e-05, "loss": 0.5229, "step": 1050 }, { "epoch": 0.9794967381174278, "grad_norm": 0.14836868684398377, "learning_rate": 1.4967207455988954e-05, "loss": 0.4624, "step": 1051 }, { "epoch": 0.9804287045666356, "grad_norm": 0.16271420420564642, "learning_rate": 1.4960303762512947e-05, "loss": 0.4984, "step": 1052 }, { "epoch": 0.9813606710158435, "grad_norm": 0.1824077068141641, "learning_rate": 1.4953400069036936e-05, "loss": 0.4875, "step": 1053 }, { "epoch": 0.9822926374650512, "grad_norm": 0.16074043305042848, "learning_rate": 1.4946496375560926e-05, "loss": 0.4812, "step": 1054 }, { "epoch": 0.983224603914259, "grad_norm": 0.16195683069321407, "learning_rate": 1.4939592682084915e-05, "loss": 0.5124, "step": 1055 }, { "epoch": 0.9841565703634669, "grad_norm": 0.18970239768665476, "learning_rate": 1.4932688988608908e-05, "loss": 0.4945, "step": 1056 }, { "epoch": 0.9850885368126747, "grad_norm": 0.1707264072348347, "learning_rate": 1.4925785295132897e-05, "loss": 0.5007, "step": 1057 }, { "epoch": 0.9860205032618826, "grad_norm": 0.14916448376065808, "learning_rate": 1.4918881601656888e-05, "loss": 0.4784, "step": 1058 }, { "epoch": 0.9869524697110904, "grad_norm": 0.17747391822272127, "learning_rate": 1.4911977908180878e-05, "loss": 0.5, "step": 1059 }, { "epoch": 0.9878844361602982, "grad_norm": 0.1910663360662259, "learning_rate": 1.4905074214704869e-05, "loss": 0.4625, "step": 1060 }, { "epoch": 0.9888164026095061, "grad_norm": 0.17390734814222733, "learning_rate": 1.4898170521228858e-05, "loss": 0.4908, "step": 1061 }, { "epoch": 0.9897483690587139, "grad_norm": 0.1623967987661761, "learning_rate": 1.4891266827752849e-05, "loss": 0.4817, "step": 1062 }, { "epoch": 0.9906803355079217, "grad_norm": 0.2111921577770382, "learning_rate": 1.488436313427684e-05, "loss": 0.5069, "step": 1063 }, { "epoch": 0.9916123019571296, "grad_norm": 0.1645453614126383, "learning_rate": 1.487745944080083e-05, "loss": 0.479, "step": 1064 }, { "epoch": 0.9925442684063374, "grad_norm": 0.16037145901255354, "learning_rate": 1.487055574732482e-05, "loss": 0.4987, "step": 1065 }, { "epoch": 0.9934762348555451, "grad_norm": 0.17902283460666238, "learning_rate": 1.486365205384881e-05, "loss": 0.5215, "step": 1066 }, { "epoch": 0.994408201304753, "grad_norm": 0.16360427131475774, "learning_rate": 1.4856748360372801e-05, "loss": 0.4686, "step": 1067 }, { "epoch": 0.9953401677539608, "grad_norm": 0.2252246151059562, "learning_rate": 1.4849844666896792e-05, "loss": 0.5142, "step": 1068 }, { "epoch": 0.9962721342031687, "grad_norm": 0.16473073305938538, "learning_rate": 1.484294097342078e-05, "loss": 0.4717, "step": 1069 }, { "epoch": 0.9972041006523765, "grad_norm": 0.1787683038348436, "learning_rate": 1.4836037279944773e-05, "loss": 0.4937, "step": 1070 }, { "epoch": 0.9981360671015843, "grad_norm": 0.1503075480080574, "learning_rate": 1.4829133586468762e-05, "loss": 0.4681, "step": 1071 }, { "epoch": 0.9990680335507922, "grad_norm": 0.15445729999258503, "learning_rate": 1.4822229892992753e-05, "loss": 0.4874, "step": 1072 }, { "epoch": 1.0, "grad_norm": 0.16770945748443036, "learning_rate": 1.4815326199516742e-05, "loss": 0.4733, "step": 1073 }, { "epoch": 1.0009319664492078, "grad_norm": 0.1468184542126245, "learning_rate": 1.4808422506040734e-05, "loss": 0.4391, "step": 1074 }, { "epoch": 1.0018639328984156, "grad_norm": 0.15864600639282234, "learning_rate": 1.4801518812564723e-05, "loss": 0.464, "step": 1075 }, { "epoch": 1.0027958993476236, "grad_norm": 0.14728826571165715, "learning_rate": 1.4794615119088714e-05, "loss": 0.4388, "step": 1076 }, { "epoch": 1.0037278657968314, "grad_norm": 0.16426269712407487, "learning_rate": 1.4787711425612703e-05, "loss": 0.486, "step": 1077 }, { "epoch": 1.0046598322460392, "grad_norm": 0.16993212333288074, "learning_rate": 1.4780807732136696e-05, "loss": 0.4855, "step": 1078 }, { "epoch": 1.005591798695247, "grad_norm": 0.15407807036337806, "learning_rate": 1.4773904038660685e-05, "loss": 0.4709, "step": 1079 }, { "epoch": 1.0065237651444547, "grad_norm": 0.14850441881799212, "learning_rate": 1.4767000345184674e-05, "loss": 0.4801, "step": 1080 }, { "epoch": 1.0074557315936625, "grad_norm": 0.15374080110317256, "learning_rate": 1.4760096651708666e-05, "loss": 0.4346, "step": 1081 }, { "epoch": 1.0083876980428705, "grad_norm": 0.19698343567079876, "learning_rate": 1.4753192958232657e-05, "loss": 0.4749, "step": 1082 }, { "epoch": 1.0093196644920783, "grad_norm": 0.15561480230037425, "learning_rate": 1.4746289264756646e-05, "loss": 0.4834, "step": 1083 }, { "epoch": 1.0102516309412861, "grad_norm": 0.15760663288055266, "learning_rate": 1.4739385571280635e-05, "loss": 0.4746, "step": 1084 }, { "epoch": 1.011183597390494, "grad_norm": 0.17815278184495628, "learning_rate": 1.4732481877804627e-05, "loss": 0.5108, "step": 1085 }, { "epoch": 1.0121155638397017, "grad_norm": 0.16414290120760483, "learning_rate": 1.4725578184328618e-05, "loss": 0.4496, "step": 1086 }, { "epoch": 1.0130475302889097, "grad_norm": 0.25995914844910334, "learning_rate": 1.4718674490852607e-05, "loss": 0.5064, "step": 1087 }, { "epoch": 1.0139794967381175, "grad_norm": 0.17175540146223198, "learning_rate": 1.4711770797376596e-05, "loss": 0.511, "step": 1088 }, { "epoch": 1.0149114631873253, "grad_norm": 0.17262283456934374, "learning_rate": 1.4704867103900589e-05, "loss": 0.5135, "step": 1089 }, { "epoch": 1.015843429636533, "grad_norm": 0.15637043380253568, "learning_rate": 1.4697963410424578e-05, "loss": 0.4838, "step": 1090 }, { "epoch": 1.0167753960857409, "grad_norm": 0.17437832575878368, "learning_rate": 1.4691059716948568e-05, "loss": 0.455, "step": 1091 }, { "epoch": 1.0177073625349486, "grad_norm": 0.16884346697280625, "learning_rate": 1.468415602347256e-05, "loss": 0.4826, "step": 1092 }, { "epoch": 1.0186393289841567, "grad_norm": 0.17972593427312955, "learning_rate": 1.467725232999655e-05, "loss": 0.4735, "step": 1093 }, { "epoch": 1.0195712954333644, "grad_norm": 0.17278617333879873, "learning_rate": 1.4670348636520539e-05, "loss": 0.4808, "step": 1094 }, { "epoch": 1.0205032618825722, "grad_norm": 0.16489080795454372, "learning_rate": 1.466344494304453e-05, "loss": 0.4671, "step": 1095 }, { "epoch": 1.02143522833178, "grad_norm": 0.15986025667834905, "learning_rate": 1.465654124956852e-05, "loss": 0.4482, "step": 1096 }, { "epoch": 1.0223671947809878, "grad_norm": 0.14528679426580832, "learning_rate": 1.4649637556092511e-05, "loss": 0.4412, "step": 1097 }, { "epoch": 1.0232991612301958, "grad_norm": 0.17848701195028346, "learning_rate": 1.46427338626165e-05, "loss": 0.4737, "step": 1098 }, { "epoch": 1.0242311276794036, "grad_norm": 0.1552901639640641, "learning_rate": 1.463583016914049e-05, "loss": 0.458, "step": 1099 }, { "epoch": 1.0251630941286114, "grad_norm": 0.15844804756645997, "learning_rate": 1.4628926475664481e-05, "loss": 0.4765, "step": 1100 }, { "epoch": 1.0260950605778192, "grad_norm": 0.16211155498518343, "learning_rate": 1.4622022782188472e-05, "loss": 0.4623, "step": 1101 }, { "epoch": 1.027027027027027, "grad_norm": 0.16963133487227136, "learning_rate": 1.4615119088712461e-05, "loss": 0.4699, "step": 1102 }, { "epoch": 1.0279589934762348, "grad_norm": 0.1616795455875387, "learning_rate": 1.4608215395236454e-05, "loss": 0.4909, "step": 1103 }, { "epoch": 1.0288909599254428, "grad_norm": 0.18381232507137868, "learning_rate": 1.4601311701760443e-05, "loss": 0.4602, "step": 1104 }, { "epoch": 1.0298229263746506, "grad_norm": 0.16311685553604605, "learning_rate": 1.4594408008284433e-05, "loss": 0.4666, "step": 1105 }, { "epoch": 1.0307548928238583, "grad_norm": 0.17180030735749666, "learning_rate": 1.4587504314808423e-05, "loss": 0.4794, "step": 1106 }, { "epoch": 1.0316868592730661, "grad_norm": 0.16608013190166954, "learning_rate": 1.4580600621332415e-05, "loss": 0.4523, "step": 1107 }, { "epoch": 1.032618825722274, "grad_norm": 0.16664721123811754, "learning_rate": 1.4573696927856404e-05, "loss": 0.477, "step": 1108 }, { "epoch": 1.0335507921714817, "grad_norm": 0.16356802972406442, "learning_rate": 1.4566793234380395e-05, "loss": 0.4652, "step": 1109 }, { "epoch": 1.0344827586206897, "grad_norm": 0.15459166152472137, "learning_rate": 1.4559889540904385e-05, "loss": 0.4581, "step": 1110 }, { "epoch": 1.0354147250698975, "grad_norm": 0.15613159675506966, "learning_rate": 1.4552985847428376e-05, "loss": 0.47, "step": 1111 }, { "epoch": 1.0363466915191053, "grad_norm": 0.15745609102122263, "learning_rate": 1.4546082153952365e-05, "loss": 0.4509, "step": 1112 }, { "epoch": 1.037278657968313, "grad_norm": 0.1747874908270434, "learning_rate": 1.4539178460476356e-05, "loss": 0.477, "step": 1113 }, { "epoch": 1.0382106244175209, "grad_norm": 0.16241465175234987, "learning_rate": 1.4532274767000347e-05, "loss": 0.4697, "step": 1114 }, { "epoch": 1.0391425908667289, "grad_norm": 0.1556235611649426, "learning_rate": 1.4525371073524337e-05, "loss": 0.488, "step": 1115 }, { "epoch": 1.0400745573159367, "grad_norm": 0.14440931941859522, "learning_rate": 1.4518467380048326e-05, "loss": 0.4442, "step": 1116 }, { "epoch": 1.0410065237651445, "grad_norm": 0.18151853247843847, "learning_rate": 1.4511563686572317e-05, "loss": 0.4889, "step": 1117 }, { "epoch": 1.0419384902143523, "grad_norm": 0.1560385484818199, "learning_rate": 1.4504659993096308e-05, "loss": 0.4554, "step": 1118 }, { "epoch": 1.04287045666356, "grad_norm": 0.16771848848036314, "learning_rate": 1.4497756299620299e-05, "loss": 0.4934, "step": 1119 }, { "epoch": 1.0438024231127678, "grad_norm": 0.15967977361460076, "learning_rate": 1.4490852606144288e-05, "loss": 0.4704, "step": 1120 }, { "epoch": 1.0447343895619758, "grad_norm": 0.3281672389516634, "learning_rate": 1.448394891266828e-05, "loss": 0.4634, "step": 1121 }, { "epoch": 1.0456663560111836, "grad_norm": 0.16763859823589672, "learning_rate": 1.4477045219192269e-05, "loss": 0.4799, "step": 1122 }, { "epoch": 1.0465983224603914, "grad_norm": 0.15545394487049777, "learning_rate": 1.447014152571626e-05, "loss": 0.4657, "step": 1123 }, { "epoch": 1.0475302889095992, "grad_norm": 0.18944673800031447, "learning_rate": 1.4463237832240249e-05, "loss": 0.4986, "step": 1124 }, { "epoch": 1.048462255358807, "grad_norm": 0.14763365662074454, "learning_rate": 1.4456334138764241e-05, "loss": 0.4612, "step": 1125 }, { "epoch": 1.049394221808015, "grad_norm": 0.160669994237166, "learning_rate": 1.444943044528823e-05, "loss": 0.4659, "step": 1126 }, { "epoch": 1.0503261882572228, "grad_norm": 0.16763424658000436, "learning_rate": 1.4442526751812221e-05, "loss": 0.5015, "step": 1127 }, { "epoch": 1.0512581547064306, "grad_norm": 0.15094990433122324, "learning_rate": 1.443562305833621e-05, "loss": 0.4661, "step": 1128 }, { "epoch": 1.0521901211556384, "grad_norm": 0.15906797609294837, "learning_rate": 1.4428719364860203e-05, "loss": 0.4638, "step": 1129 }, { "epoch": 1.0531220876048462, "grad_norm": 0.15427411926337464, "learning_rate": 1.4421815671384192e-05, "loss": 0.4629, "step": 1130 }, { "epoch": 1.054054054054054, "grad_norm": 0.1753251061708393, "learning_rate": 1.441491197790818e-05, "loss": 0.4892, "step": 1131 }, { "epoch": 1.054986020503262, "grad_norm": 0.19406616176657962, "learning_rate": 1.4408008284432173e-05, "loss": 0.4741, "step": 1132 }, { "epoch": 1.0559179869524697, "grad_norm": 0.15930854996950525, "learning_rate": 1.4401104590956164e-05, "loss": 0.4662, "step": 1133 }, { "epoch": 1.0568499534016775, "grad_norm": 0.16769500474736979, "learning_rate": 1.4394200897480153e-05, "loss": 0.4975, "step": 1134 }, { "epoch": 1.0577819198508853, "grad_norm": 0.16328322889596394, "learning_rate": 1.4387297204004142e-05, "loss": 0.5053, "step": 1135 }, { "epoch": 1.058713886300093, "grad_norm": 0.14666037883804017, "learning_rate": 1.4380393510528134e-05, "loss": 0.4647, "step": 1136 }, { "epoch": 1.0596458527493011, "grad_norm": 0.18775572191910217, "learning_rate": 1.4373489817052125e-05, "loss": 0.4984, "step": 1137 }, { "epoch": 1.060577819198509, "grad_norm": 0.16318519363211667, "learning_rate": 1.4366586123576114e-05, "loss": 0.4742, "step": 1138 }, { "epoch": 1.0615097856477167, "grad_norm": 0.1717740998938448, "learning_rate": 1.4359682430100103e-05, "loss": 0.4742, "step": 1139 }, { "epoch": 1.0624417520969245, "grad_norm": 0.16278109930318635, "learning_rate": 1.4352778736624096e-05, "loss": 0.4867, "step": 1140 }, { "epoch": 1.0633737185461323, "grad_norm": 0.16974212591864674, "learning_rate": 1.4345875043148085e-05, "loss": 0.4553, "step": 1141 }, { "epoch": 1.06430568499534, "grad_norm": 0.1708690476860089, "learning_rate": 1.4338971349672075e-05, "loss": 0.4522, "step": 1142 }, { "epoch": 1.065237651444548, "grad_norm": 0.17198663743724768, "learning_rate": 1.4332067656196068e-05, "loss": 0.4931, "step": 1143 }, { "epoch": 1.0661696178937559, "grad_norm": 0.15323840634603117, "learning_rate": 1.4325163962720057e-05, "loss": 0.465, "step": 1144 }, { "epoch": 1.0671015843429636, "grad_norm": 0.16428458578301328, "learning_rate": 1.4318260269244046e-05, "loss": 0.4825, "step": 1145 }, { "epoch": 1.0680335507921714, "grad_norm": 0.16570275671758508, "learning_rate": 1.4311356575768037e-05, "loss": 0.4736, "step": 1146 }, { "epoch": 1.0689655172413792, "grad_norm": 0.19570161092115076, "learning_rate": 1.4304452882292027e-05, "loss": 0.4705, "step": 1147 }, { "epoch": 1.0698974836905872, "grad_norm": 0.1822374394260547, "learning_rate": 1.4297549188816018e-05, "loss": 0.4562, "step": 1148 }, { "epoch": 1.070829450139795, "grad_norm": 0.16892685872252738, "learning_rate": 1.4290645495340007e-05, "loss": 0.4549, "step": 1149 }, { "epoch": 1.0717614165890028, "grad_norm": 0.15939493402337834, "learning_rate": 1.4283741801864e-05, "loss": 0.4758, "step": 1150 }, { "epoch": 1.0726933830382106, "grad_norm": 0.15155256637341788, "learning_rate": 1.4276838108387989e-05, "loss": 0.4539, "step": 1151 }, { "epoch": 1.0736253494874184, "grad_norm": 0.16491687882027148, "learning_rate": 1.426993441491198e-05, "loss": 0.4813, "step": 1152 }, { "epoch": 1.0745573159366262, "grad_norm": 0.15488919594145073, "learning_rate": 1.4263030721435968e-05, "loss": 0.4885, "step": 1153 }, { "epoch": 1.0754892823858342, "grad_norm": 0.17723341003164628, "learning_rate": 1.425612702795996e-05, "loss": 0.4554, "step": 1154 }, { "epoch": 1.076421248835042, "grad_norm": 0.18907656238814474, "learning_rate": 1.424922333448395e-05, "loss": 0.4894, "step": 1155 }, { "epoch": 1.0773532152842498, "grad_norm": 0.17227932107827792, "learning_rate": 1.424231964100794e-05, "loss": 0.4695, "step": 1156 }, { "epoch": 1.0782851817334576, "grad_norm": 0.15062270278778403, "learning_rate": 1.423541594753193e-05, "loss": 0.4684, "step": 1157 }, { "epoch": 1.0792171481826653, "grad_norm": 0.15091104198119992, "learning_rate": 1.4228512254055922e-05, "loss": 0.4703, "step": 1158 }, { "epoch": 1.0801491146318734, "grad_norm": 0.15789057313612795, "learning_rate": 1.4221608560579911e-05, "loss": 0.4692, "step": 1159 }, { "epoch": 1.0810810810810811, "grad_norm": 0.1491797842977972, "learning_rate": 1.4214704867103902e-05, "loss": 0.458, "step": 1160 }, { "epoch": 1.082013047530289, "grad_norm": 0.15799040367167153, "learning_rate": 1.4207801173627892e-05, "loss": 0.468, "step": 1161 }, { "epoch": 1.0829450139794967, "grad_norm": 0.1716374057691793, "learning_rate": 1.4200897480151883e-05, "loss": 0.4972, "step": 1162 }, { "epoch": 1.0838769804287045, "grad_norm": 0.16188144982407993, "learning_rate": 1.4193993786675872e-05, "loss": 0.5058, "step": 1163 }, { "epoch": 1.0848089468779123, "grad_norm": 0.16406866518263988, "learning_rate": 1.4187090093199863e-05, "loss": 0.4603, "step": 1164 }, { "epoch": 1.0857409133271203, "grad_norm": 0.16415641361501207, "learning_rate": 1.4180186399723854e-05, "loss": 0.4522, "step": 1165 }, { "epoch": 1.086672879776328, "grad_norm": 0.1627372912626162, "learning_rate": 1.4173282706247844e-05, "loss": 0.4712, "step": 1166 }, { "epoch": 1.0876048462255359, "grad_norm": 0.15985413155373665, "learning_rate": 1.4166379012771833e-05, "loss": 0.4885, "step": 1167 }, { "epoch": 1.0885368126747437, "grad_norm": 0.16205898209697275, "learning_rate": 1.4159475319295824e-05, "loss": 0.4639, "step": 1168 }, { "epoch": 1.0894687791239515, "grad_norm": 0.17876065748696585, "learning_rate": 1.4152571625819815e-05, "loss": 0.5271, "step": 1169 }, { "epoch": 1.0904007455731595, "grad_norm": 0.16402286183345705, "learning_rate": 1.4145667932343806e-05, "loss": 0.48, "step": 1170 }, { "epoch": 1.0913327120223673, "grad_norm": 0.1798259267309379, "learning_rate": 1.4138764238867795e-05, "loss": 0.4941, "step": 1171 }, { "epoch": 1.092264678471575, "grad_norm": 0.16357867304128848, "learning_rate": 1.4131860545391787e-05, "loss": 0.4668, "step": 1172 }, { "epoch": 1.0931966449207828, "grad_norm": 0.16168654847816719, "learning_rate": 1.4124956851915776e-05, "loss": 0.4604, "step": 1173 }, { "epoch": 1.0941286113699906, "grad_norm": 0.1585848111840014, "learning_rate": 1.4118053158439767e-05, "loss": 0.4445, "step": 1174 }, { "epoch": 1.0950605778191984, "grad_norm": 0.16044434818394307, "learning_rate": 1.4111149464963756e-05, "loss": 0.4742, "step": 1175 }, { "epoch": 1.0959925442684064, "grad_norm": 0.16357607392725254, "learning_rate": 1.4104245771487748e-05, "loss": 0.4741, "step": 1176 }, { "epoch": 1.0969245107176142, "grad_norm": 0.1623552909192372, "learning_rate": 1.4097342078011737e-05, "loss": 0.4831, "step": 1177 }, { "epoch": 1.097856477166822, "grad_norm": 0.17468283186615818, "learning_rate": 1.4090438384535728e-05, "loss": 0.4696, "step": 1178 }, { "epoch": 1.0987884436160298, "grad_norm": 0.15355567563255088, "learning_rate": 1.4083534691059717e-05, "loss": 0.4625, "step": 1179 }, { "epoch": 1.0997204100652376, "grad_norm": 0.16170039424512875, "learning_rate": 1.407663099758371e-05, "loss": 0.482, "step": 1180 }, { "epoch": 1.1006523765144456, "grad_norm": 0.18365836587919943, "learning_rate": 1.4069727304107699e-05, "loss": 0.4647, "step": 1181 }, { "epoch": 1.1015843429636534, "grad_norm": 0.1658419723403469, "learning_rate": 1.4062823610631688e-05, "loss": 0.4741, "step": 1182 }, { "epoch": 1.1025163094128612, "grad_norm": 0.13765393101305246, "learning_rate": 1.405591991715568e-05, "loss": 0.4465, "step": 1183 }, { "epoch": 1.103448275862069, "grad_norm": 0.16049794965821798, "learning_rate": 1.404901622367967e-05, "loss": 0.4867, "step": 1184 }, { "epoch": 1.1043802423112767, "grad_norm": 0.17003233819320288, "learning_rate": 1.404211253020366e-05, "loss": 0.4777, "step": 1185 }, { "epoch": 1.1053122087604845, "grad_norm": 0.24753619584015515, "learning_rate": 1.4035208836727649e-05, "loss": 0.5007, "step": 1186 }, { "epoch": 1.1062441752096925, "grad_norm": 0.1448482536722156, "learning_rate": 1.4028305143251641e-05, "loss": 0.4731, "step": 1187 }, { "epoch": 1.1071761416589003, "grad_norm": 0.16333364797022962, "learning_rate": 1.402140144977563e-05, "loss": 0.4861, "step": 1188 }, { "epoch": 1.1081081081081081, "grad_norm": 0.14846484293141535, "learning_rate": 1.4014497756299621e-05, "loss": 0.4518, "step": 1189 }, { "epoch": 1.109040074557316, "grad_norm": 0.15032614686794954, "learning_rate": 1.400759406282361e-05, "loss": 0.4584, "step": 1190 }, { "epoch": 1.1099720410065237, "grad_norm": 0.16450713392449412, "learning_rate": 1.4000690369347603e-05, "loss": 0.4789, "step": 1191 }, { "epoch": 1.1109040074557317, "grad_norm": 0.14890064438632628, "learning_rate": 1.3993786675871592e-05, "loss": 0.4735, "step": 1192 }, { "epoch": 1.1118359739049395, "grad_norm": 0.15704286930483496, "learning_rate": 1.3986882982395582e-05, "loss": 0.4723, "step": 1193 }, { "epoch": 1.1127679403541473, "grad_norm": 0.18242743868017428, "learning_rate": 1.3979979288919575e-05, "loss": 0.4884, "step": 1194 }, { "epoch": 1.113699906803355, "grad_norm": 0.15841934811192884, "learning_rate": 1.3973075595443564e-05, "loss": 0.5001, "step": 1195 }, { "epoch": 1.1146318732525629, "grad_norm": 0.16585333174175548, "learning_rate": 1.3966171901967553e-05, "loss": 0.4787, "step": 1196 }, { "epoch": 1.1155638397017706, "grad_norm": 0.1879937322902736, "learning_rate": 1.3959268208491544e-05, "loss": 0.4805, "step": 1197 }, { "epoch": 1.1164958061509787, "grad_norm": 0.14513223346517837, "learning_rate": 1.3952364515015534e-05, "loss": 0.4673, "step": 1198 }, { "epoch": 1.1174277726001864, "grad_norm": 0.16237333299841364, "learning_rate": 1.3945460821539525e-05, "loss": 0.4772, "step": 1199 }, { "epoch": 1.1183597390493942, "grad_norm": 0.17627138755620256, "learning_rate": 1.3938557128063514e-05, "loss": 0.4596, "step": 1200 }, { "epoch": 1.119291705498602, "grad_norm": 0.16382332837226446, "learning_rate": 1.3931653434587507e-05, "loss": 0.4957, "step": 1201 }, { "epoch": 1.1202236719478098, "grad_norm": 0.16009308890224688, "learning_rate": 1.3924749741111496e-05, "loss": 0.4469, "step": 1202 }, { "epoch": 1.1211556383970178, "grad_norm": 0.16792533131272122, "learning_rate": 1.3917846047635486e-05, "loss": 0.4586, "step": 1203 }, { "epoch": 1.1220876048462256, "grad_norm": 0.1514298669748293, "learning_rate": 1.3910942354159475e-05, "loss": 0.4632, "step": 1204 }, { "epoch": 1.1230195712954334, "grad_norm": 0.15943793730539857, "learning_rate": 1.3904038660683468e-05, "loss": 0.4856, "step": 1205 }, { "epoch": 1.1239515377446412, "grad_norm": 0.16097656647245734, "learning_rate": 1.3897134967207457e-05, "loss": 0.4551, "step": 1206 }, { "epoch": 1.124883504193849, "grad_norm": 0.15605143536428015, "learning_rate": 1.3890231273731448e-05, "loss": 0.4782, "step": 1207 }, { "epoch": 1.1258154706430568, "grad_norm": 0.16197745186152396, "learning_rate": 1.3883327580255437e-05, "loss": 0.4701, "step": 1208 }, { "epoch": 1.1267474370922648, "grad_norm": 0.15945179129894105, "learning_rate": 1.3876423886779429e-05, "loss": 0.4518, "step": 1209 }, { "epoch": 1.1276794035414726, "grad_norm": 0.1485516793393698, "learning_rate": 1.3869520193303418e-05, "loss": 0.469, "step": 1210 }, { "epoch": 1.1286113699906803, "grad_norm": 0.16716334932857616, "learning_rate": 1.3862616499827409e-05, "loss": 0.477, "step": 1211 }, { "epoch": 1.1295433364398881, "grad_norm": 0.15894981555512908, "learning_rate": 1.38557128063514e-05, "loss": 0.4659, "step": 1212 }, { "epoch": 1.130475302889096, "grad_norm": 0.1668194111985826, "learning_rate": 1.384880911287539e-05, "loss": 0.4966, "step": 1213 }, { "epoch": 1.131407269338304, "grad_norm": 0.15264792218428494, "learning_rate": 1.384190541939938e-05, "loss": 0.4775, "step": 1214 }, { "epoch": 1.1323392357875117, "grad_norm": 0.16190422811863736, "learning_rate": 1.383500172592337e-05, "loss": 0.4621, "step": 1215 }, { "epoch": 1.1332712022367195, "grad_norm": 0.15014579139609108, "learning_rate": 1.382809803244736e-05, "loss": 0.4662, "step": 1216 }, { "epoch": 1.1342031686859273, "grad_norm": 0.1539094627857667, "learning_rate": 1.3821194338971351e-05, "loss": 0.4638, "step": 1217 }, { "epoch": 1.135135135135135, "grad_norm": 0.16694666006400746, "learning_rate": 1.381429064549534e-05, "loss": 0.4727, "step": 1218 }, { "epoch": 1.1360671015843429, "grad_norm": 0.15645806123584732, "learning_rate": 1.3807386952019331e-05, "loss": 0.4601, "step": 1219 }, { "epoch": 1.1369990680335509, "grad_norm": 0.16312465411088994, "learning_rate": 1.3800483258543322e-05, "loss": 0.4967, "step": 1220 }, { "epoch": 1.1379310344827587, "grad_norm": 0.1630463830239012, "learning_rate": 1.3793579565067313e-05, "loss": 0.4978, "step": 1221 }, { "epoch": 1.1388630009319665, "grad_norm": 0.1442978888926596, "learning_rate": 1.3786675871591302e-05, "loss": 0.4493, "step": 1222 }, { "epoch": 1.1397949673811743, "grad_norm": 0.16058889574228338, "learning_rate": 1.3779772178115294e-05, "loss": 0.4638, "step": 1223 }, { "epoch": 1.140726933830382, "grad_norm": 0.15028495905016462, "learning_rate": 1.3772868484639283e-05, "loss": 0.459, "step": 1224 }, { "epoch": 1.14165890027959, "grad_norm": 0.1562381129939285, "learning_rate": 1.3765964791163274e-05, "loss": 0.4911, "step": 1225 }, { "epoch": 1.1425908667287978, "grad_norm": 0.15569890358500937, "learning_rate": 1.3759061097687263e-05, "loss": 0.5047, "step": 1226 }, { "epoch": 1.1435228331780056, "grad_norm": 0.14950655945911917, "learning_rate": 1.3752157404211255e-05, "loss": 0.4677, "step": 1227 }, { "epoch": 1.1444547996272134, "grad_norm": 0.16792920860988592, "learning_rate": 1.3745253710735244e-05, "loss": 0.4956, "step": 1228 }, { "epoch": 1.1453867660764212, "grad_norm": 0.16069762132686638, "learning_rate": 1.3738350017259235e-05, "loss": 0.4451, "step": 1229 }, { "epoch": 1.146318732525629, "grad_norm": 0.1480841283763984, "learning_rate": 1.3731446323783224e-05, "loss": 0.4691, "step": 1230 }, { "epoch": 1.147250698974837, "grad_norm": 0.15577456381608126, "learning_rate": 1.3724542630307217e-05, "loss": 0.5084, "step": 1231 }, { "epoch": 1.1481826654240448, "grad_norm": 0.1508872321005036, "learning_rate": 1.3717638936831206e-05, "loss": 0.4421, "step": 1232 }, { "epoch": 1.1491146318732526, "grad_norm": 0.1546241651320537, "learning_rate": 1.3710735243355195e-05, "loss": 0.4615, "step": 1233 }, { "epoch": 1.1500465983224604, "grad_norm": 0.15525996868527017, "learning_rate": 1.3703831549879187e-05, "loss": 0.4754, "step": 1234 }, { "epoch": 1.1509785647716682, "grad_norm": 0.16253559615738938, "learning_rate": 1.3696927856403178e-05, "loss": 0.4759, "step": 1235 }, { "epoch": 1.1519105312208762, "grad_norm": 0.1590027708820626, "learning_rate": 1.3690024162927167e-05, "loss": 0.468, "step": 1236 }, { "epoch": 1.152842497670084, "grad_norm": 0.1450262651090182, "learning_rate": 1.3683120469451156e-05, "loss": 0.4632, "step": 1237 }, { "epoch": 1.1537744641192917, "grad_norm": 0.15126316984182714, "learning_rate": 1.3676216775975148e-05, "loss": 0.4719, "step": 1238 }, { "epoch": 1.1547064305684995, "grad_norm": 0.16301472741098263, "learning_rate": 1.3669313082499137e-05, "loss": 0.4697, "step": 1239 }, { "epoch": 1.1556383970177073, "grad_norm": 0.15954353361897772, "learning_rate": 1.3662409389023128e-05, "loss": 0.475, "step": 1240 }, { "epoch": 1.156570363466915, "grad_norm": 0.16612744891114273, "learning_rate": 1.365550569554712e-05, "loss": 0.492, "step": 1241 }, { "epoch": 1.157502329916123, "grad_norm": 0.14983174723837667, "learning_rate": 1.364860200207111e-05, "loss": 0.4688, "step": 1242 }, { "epoch": 1.158434296365331, "grad_norm": 0.1555327160291559, "learning_rate": 1.3641698308595099e-05, "loss": 0.4942, "step": 1243 }, { "epoch": 1.1593662628145387, "grad_norm": 0.15855549453883377, "learning_rate": 1.363479461511909e-05, "loss": 0.4915, "step": 1244 }, { "epoch": 1.1602982292637465, "grad_norm": 0.150300663952317, "learning_rate": 1.3627890921643082e-05, "loss": 0.463, "step": 1245 }, { "epoch": 1.1612301957129543, "grad_norm": 0.17198410730875188, "learning_rate": 1.362098722816707e-05, "loss": 0.4735, "step": 1246 }, { "epoch": 1.1621621621621623, "grad_norm": 0.17896519880756578, "learning_rate": 1.361408353469106e-05, "loss": 0.4962, "step": 1247 }, { "epoch": 1.16309412861137, "grad_norm": 0.15941953682091234, "learning_rate": 1.360717984121505e-05, "loss": 0.473, "step": 1248 }, { "epoch": 1.1640260950605779, "grad_norm": 0.1590541974632757, "learning_rate": 1.3600276147739041e-05, "loss": 0.4635, "step": 1249 }, { "epoch": 1.1649580615097856, "grad_norm": 0.16440440246248614, "learning_rate": 1.3593372454263032e-05, "loss": 0.4426, "step": 1250 }, { "epoch": 1.1658900279589934, "grad_norm": 0.1623488576961562, "learning_rate": 1.3586468760787021e-05, "loss": 0.4432, "step": 1251 }, { "epoch": 1.1668219944082012, "grad_norm": 0.15298205974172263, "learning_rate": 1.3579565067311014e-05, "loss": 0.4411, "step": 1252 }, { "epoch": 1.167753960857409, "grad_norm": 0.17851250350411962, "learning_rate": 1.3572661373835003e-05, "loss": 0.4555, "step": 1253 }, { "epoch": 1.168685927306617, "grad_norm": 0.1540052799191384, "learning_rate": 1.3565757680358993e-05, "loss": 0.4509, "step": 1254 }, { "epoch": 1.1696178937558248, "grad_norm": 0.16445551904017314, "learning_rate": 1.3558853986882982e-05, "loss": 0.4512, "step": 1255 }, { "epoch": 1.1705498602050326, "grad_norm": 0.1612020129602718, "learning_rate": 1.3551950293406975e-05, "loss": 0.4761, "step": 1256 }, { "epoch": 1.1714818266542404, "grad_norm": 0.14919283802746727, "learning_rate": 1.3545046599930964e-05, "loss": 0.4645, "step": 1257 }, { "epoch": 1.1724137931034484, "grad_norm": 0.15087238568301767, "learning_rate": 1.3538142906454955e-05, "loss": 0.4462, "step": 1258 }, { "epoch": 1.1733457595526562, "grad_norm": 0.15973233274943127, "learning_rate": 1.3531239212978944e-05, "loss": 0.4936, "step": 1259 }, { "epoch": 1.174277726001864, "grad_norm": 0.14916581775413992, "learning_rate": 1.3524335519502936e-05, "loss": 0.4611, "step": 1260 }, { "epoch": 1.1752096924510718, "grad_norm": 0.16334516593470635, "learning_rate": 1.3517431826026925e-05, "loss": 0.4605, "step": 1261 }, { "epoch": 1.1761416589002796, "grad_norm": 0.15232794831868668, "learning_rate": 1.3510528132550916e-05, "loss": 0.4595, "step": 1262 }, { "epoch": 1.1770736253494873, "grad_norm": 0.15037529416815085, "learning_rate": 1.3503624439074907e-05, "loss": 0.4481, "step": 1263 }, { "epoch": 1.1780055917986951, "grad_norm": 0.1689452278706572, "learning_rate": 1.3496720745598897e-05, "loss": 0.4821, "step": 1264 }, { "epoch": 1.1789375582479031, "grad_norm": 0.16423978105529946, "learning_rate": 1.3489817052122886e-05, "loss": 0.4586, "step": 1265 }, { "epoch": 1.179869524697111, "grad_norm": 0.15408066399862866, "learning_rate": 1.3482913358646877e-05, "loss": 0.4797, "step": 1266 }, { "epoch": 1.1808014911463187, "grad_norm": 0.1784226557454893, "learning_rate": 1.3476009665170868e-05, "loss": 0.4919, "step": 1267 }, { "epoch": 1.1817334575955265, "grad_norm": 0.16498360410187474, "learning_rate": 1.3469105971694858e-05, "loss": 0.4806, "step": 1268 }, { "epoch": 1.1826654240447343, "grad_norm": 0.15437358445296873, "learning_rate": 1.3462202278218848e-05, "loss": 0.4876, "step": 1269 }, { "epoch": 1.1835973904939423, "grad_norm": 0.15232423704485204, "learning_rate": 1.3455298584742838e-05, "loss": 0.457, "step": 1270 }, { "epoch": 1.18452935694315, "grad_norm": 0.15879058325745665, "learning_rate": 1.3448394891266829e-05, "loss": 0.4963, "step": 1271 }, { "epoch": 1.1854613233923579, "grad_norm": 0.1520835430530478, "learning_rate": 1.344149119779082e-05, "loss": 0.4724, "step": 1272 }, { "epoch": 1.1863932898415657, "grad_norm": 0.15149061059590066, "learning_rate": 1.3434587504314809e-05, "loss": 0.4361, "step": 1273 }, { "epoch": 1.1873252562907735, "grad_norm": 0.1627861126307535, "learning_rate": 1.3427683810838801e-05, "loss": 0.4554, "step": 1274 }, { "epoch": 1.1882572227399812, "grad_norm": 0.1639184658061794, "learning_rate": 1.342078011736279e-05, "loss": 0.4698, "step": 1275 }, { "epoch": 1.1891891891891893, "grad_norm": 0.14702246399831653, "learning_rate": 1.3413876423886781e-05, "loss": 0.4822, "step": 1276 }, { "epoch": 1.190121155638397, "grad_norm": 0.2070976338544847, "learning_rate": 1.340697273041077e-05, "loss": 0.503, "step": 1277 }, { "epoch": 1.1910531220876048, "grad_norm": 0.17187878472703827, "learning_rate": 1.3400069036934762e-05, "loss": 0.4972, "step": 1278 }, { "epoch": 1.1919850885368126, "grad_norm": 0.15092008615721322, "learning_rate": 1.3393165343458751e-05, "loss": 0.4606, "step": 1279 }, { "epoch": 1.1929170549860204, "grad_norm": 0.15181242741004985, "learning_rate": 1.3386261649982742e-05, "loss": 0.4596, "step": 1280 }, { "epoch": 1.1938490214352284, "grad_norm": 0.1866475670530004, "learning_rate": 1.3379357956506733e-05, "loss": 0.4774, "step": 1281 }, { "epoch": 1.1947809878844362, "grad_norm": 0.15387477047684775, "learning_rate": 1.3372454263030724e-05, "loss": 0.4628, "step": 1282 }, { "epoch": 1.195712954333644, "grad_norm": 0.1636168111337718, "learning_rate": 1.3365550569554713e-05, "loss": 0.4929, "step": 1283 }, { "epoch": 1.1966449207828518, "grad_norm": 0.16659581799676376, "learning_rate": 1.3358646876078702e-05, "loss": 0.4845, "step": 1284 }, { "epoch": 1.1975768872320596, "grad_norm": 0.15329913837367182, "learning_rate": 1.3351743182602694e-05, "loss": 0.4796, "step": 1285 }, { "epoch": 1.1985088536812674, "grad_norm": 0.1518918124291804, "learning_rate": 1.3344839489126685e-05, "loss": 0.4985, "step": 1286 }, { "epoch": 1.1994408201304754, "grad_norm": 0.15861376287700382, "learning_rate": 1.3337935795650674e-05, "loss": 0.4365, "step": 1287 }, { "epoch": 1.2003727865796832, "grad_norm": 0.14872647171661396, "learning_rate": 1.3331032102174663e-05, "loss": 0.4629, "step": 1288 }, { "epoch": 1.201304753028891, "grad_norm": 0.14350848283434162, "learning_rate": 1.3324128408698655e-05, "loss": 0.4483, "step": 1289 }, { "epoch": 1.2022367194780987, "grad_norm": 0.16571688762999784, "learning_rate": 1.3317224715222644e-05, "loss": 0.4661, "step": 1290 }, { "epoch": 1.2031686859273065, "grad_norm": 0.15741170817155717, "learning_rate": 1.3310321021746635e-05, "loss": 0.4863, "step": 1291 }, { "epoch": 1.2041006523765145, "grad_norm": 0.1443550505282064, "learning_rate": 1.3303417328270628e-05, "loss": 0.4292, "step": 1292 }, { "epoch": 1.2050326188257223, "grad_norm": 0.16676175341819907, "learning_rate": 1.3296513634794617e-05, "loss": 0.483, "step": 1293 }, { "epoch": 1.2059645852749301, "grad_norm": 0.18193955755380276, "learning_rate": 1.3289609941318606e-05, "loss": 0.5056, "step": 1294 }, { "epoch": 1.206896551724138, "grad_norm": 0.15176684971321236, "learning_rate": 1.3282706247842596e-05, "loss": 0.4522, "step": 1295 }, { "epoch": 1.2078285181733457, "grad_norm": 0.1625345163265397, "learning_rate": 1.3275802554366589e-05, "loss": 0.4868, "step": 1296 }, { "epoch": 1.2087604846225535, "grad_norm": 0.17040204117358893, "learning_rate": 1.3268898860890578e-05, "loss": 0.471, "step": 1297 }, { "epoch": 1.2096924510717615, "grad_norm": 0.16753040993298707, "learning_rate": 1.3261995167414567e-05, "loss": 0.4587, "step": 1298 }, { "epoch": 1.2106244175209693, "grad_norm": 0.16240287343148477, "learning_rate": 1.3255091473938558e-05, "loss": 0.4808, "step": 1299 }, { "epoch": 1.211556383970177, "grad_norm": 0.17418422652367585, "learning_rate": 1.3248187780462548e-05, "loss": 0.4964, "step": 1300 }, { "epoch": 1.2124883504193849, "grad_norm": 0.15641670370982008, "learning_rate": 1.3241284086986539e-05, "loss": 0.4849, "step": 1301 }, { "epoch": 1.2134203168685926, "grad_norm": 0.15893491239984306, "learning_rate": 1.3234380393510528e-05, "loss": 0.4907, "step": 1302 }, { "epoch": 1.2143522833178007, "grad_norm": 0.14376310116026708, "learning_rate": 1.322747670003452e-05, "loss": 0.4593, "step": 1303 }, { "epoch": 1.2152842497670084, "grad_norm": 0.1523047237105535, "learning_rate": 1.322057300655851e-05, "loss": 0.4469, "step": 1304 }, { "epoch": 1.2162162162162162, "grad_norm": 0.15946235575405507, "learning_rate": 1.32136693130825e-05, "loss": 0.4863, "step": 1305 }, { "epoch": 1.217148182665424, "grad_norm": 0.154068929125921, "learning_rate": 1.320676561960649e-05, "loss": 0.4752, "step": 1306 }, { "epoch": 1.2180801491146318, "grad_norm": 0.1586533346783499, "learning_rate": 1.3199861926130482e-05, "loss": 0.4692, "step": 1307 }, { "epoch": 1.2190121155638396, "grad_norm": 0.16466894663356552, "learning_rate": 1.3192958232654471e-05, "loss": 0.4573, "step": 1308 }, { "epoch": 1.2199440820130476, "grad_norm": 0.15975859171500184, "learning_rate": 1.3186054539178462e-05, "loss": 0.4638, "step": 1309 }, { "epoch": 1.2208760484622554, "grad_norm": 0.15100657850971944, "learning_rate": 1.317915084570245e-05, "loss": 0.4476, "step": 1310 }, { "epoch": 1.2218080149114632, "grad_norm": 0.1805517364999586, "learning_rate": 1.3172247152226443e-05, "loss": 0.4886, "step": 1311 }, { "epoch": 1.222739981360671, "grad_norm": 0.1849859273331578, "learning_rate": 1.3165343458750432e-05, "loss": 0.5187, "step": 1312 }, { "epoch": 1.2236719478098788, "grad_norm": 0.1510497878553946, "learning_rate": 1.3158439765274423e-05, "loss": 0.435, "step": 1313 }, { "epoch": 1.2246039142590868, "grad_norm": 0.16031288812479177, "learning_rate": 1.3151536071798414e-05, "loss": 0.5002, "step": 1314 }, { "epoch": 1.2255358807082946, "grad_norm": 0.16630216257695277, "learning_rate": 1.3144632378322404e-05, "loss": 0.5073, "step": 1315 }, { "epoch": 1.2264678471575023, "grad_norm": 0.14983286050884712, "learning_rate": 1.3137728684846393e-05, "loss": 0.4555, "step": 1316 }, { "epoch": 1.2273998136067101, "grad_norm": 0.16666028431578012, "learning_rate": 1.3130824991370384e-05, "loss": 0.4663, "step": 1317 }, { "epoch": 1.228331780055918, "grad_norm": 0.14806779460286168, "learning_rate": 1.3123921297894375e-05, "loss": 0.4583, "step": 1318 }, { "epoch": 1.2292637465051257, "grad_norm": 0.15381405143058027, "learning_rate": 1.3117017604418366e-05, "loss": 0.4739, "step": 1319 }, { "epoch": 1.2301957129543337, "grad_norm": 0.15316956554759234, "learning_rate": 1.3110113910942355e-05, "loss": 0.4707, "step": 1320 }, { "epoch": 1.2311276794035415, "grad_norm": 0.17262322276100045, "learning_rate": 1.3103210217466345e-05, "loss": 0.5072, "step": 1321 }, { "epoch": 1.2320596458527493, "grad_norm": 0.15090108761451423, "learning_rate": 1.3096306523990336e-05, "loss": 0.4686, "step": 1322 }, { "epoch": 1.232991612301957, "grad_norm": 0.15684696511149165, "learning_rate": 1.3089402830514327e-05, "loss": 0.4778, "step": 1323 }, { "epoch": 1.2339235787511649, "grad_norm": 0.15851468718829262, "learning_rate": 1.3082499137038316e-05, "loss": 0.4873, "step": 1324 }, { "epoch": 1.2348555452003729, "grad_norm": 0.1470804108478301, "learning_rate": 1.3075595443562308e-05, "loss": 0.4613, "step": 1325 }, { "epoch": 1.2357875116495807, "grad_norm": 0.14825733718051232, "learning_rate": 1.3068691750086297e-05, "loss": 0.464, "step": 1326 }, { "epoch": 1.2367194780987885, "grad_norm": 0.15366896422922638, "learning_rate": 1.3061788056610288e-05, "loss": 0.4654, "step": 1327 }, { "epoch": 1.2376514445479962, "grad_norm": 0.17479268882014876, "learning_rate": 1.3054884363134277e-05, "loss": 0.4745, "step": 1328 }, { "epoch": 1.238583410997204, "grad_norm": 0.1622149732324725, "learning_rate": 1.304798066965827e-05, "loss": 0.4785, "step": 1329 }, { "epoch": 1.2395153774464118, "grad_norm": 0.14813248002973578, "learning_rate": 1.3041076976182258e-05, "loss": 0.458, "step": 1330 }, { "epoch": 1.2404473438956198, "grad_norm": 0.1515564750067901, "learning_rate": 1.3034173282706248e-05, "loss": 0.4715, "step": 1331 }, { "epoch": 1.2413793103448276, "grad_norm": 0.14511974477141684, "learning_rate": 1.302726958923024e-05, "loss": 0.452, "step": 1332 }, { "epoch": 1.2423112767940354, "grad_norm": 0.1438003821491688, "learning_rate": 1.302036589575423e-05, "loss": 0.4534, "step": 1333 }, { "epoch": 1.2432432432432432, "grad_norm": 0.15600719421020595, "learning_rate": 1.301346220227822e-05, "loss": 0.4675, "step": 1334 }, { "epoch": 1.244175209692451, "grad_norm": 0.1522745654591199, "learning_rate": 1.3006558508802209e-05, "loss": 0.4798, "step": 1335 }, { "epoch": 1.245107176141659, "grad_norm": 0.14581011825212584, "learning_rate": 1.2999654815326201e-05, "loss": 0.4552, "step": 1336 }, { "epoch": 1.2460391425908668, "grad_norm": 0.151685572656042, "learning_rate": 1.2992751121850192e-05, "loss": 0.4667, "step": 1337 }, { "epoch": 1.2469711090400746, "grad_norm": 0.15083551794103953, "learning_rate": 1.2985847428374181e-05, "loss": 0.4548, "step": 1338 }, { "epoch": 1.2479030754892824, "grad_norm": 0.1575401777527269, "learning_rate": 1.297894373489817e-05, "loss": 0.5017, "step": 1339 }, { "epoch": 1.2488350419384902, "grad_norm": 0.16319973726982978, "learning_rate": 1.2972040041422162e-05, "loss": 0.4828, "step": 1340 }, { "epoch": 1.249767008387698, "grad_norm": 0.1743428843544405, "learning_rate": 1.2965136347946151e-05, "loss": 0.4398, "step": 1341 }, { "epoch": 1.250698974836906, "grad_norm": 0.1523421214541881, "learning_rate": 1.2958232654470142e-05, "loss": 0.4742, "step": 1342 }, { "epoch": 1.2516309412861137, "grad_norm": 0.1455968442626656, "learning_rate": 1.2951328960994135e-05, "loss": 0.4456, "step": 1343 }, { "epoch": 1.2525629077353215, "grad_norm": 0.17228913218907604, "learning_rate": 1.2944425267518124e-05, "loss": 0.4499, "step": 1344 }, { "epoch": 1.2534948741845293, "grad_norm": 0.15016205128040552, "learning_rate": 1.2937521574042113e-05, "loss": 0.4636, "step": 1345 }, { "epoch": 1.254426840633737, "grad_norm": 0.15683467583825947, "learning_rate": 1.2930617880566103e-05, "loss": 0.4605, "step": 1346 }, { "epoch": 1.2553588070829451, "grad_norm": 0.15909172376676797, "learning_rate": 1.2923714187090096e-05, "loss": 0.4542, "step": 1347 }, { "epoch": 1.256290773532153, "grad_norm": 0.155944271860575, "learning_rate": 1.2916810493614085e-05, "loss": 0.4851, "step": 1348 }, { "epoch": 1.2572227399813607, "grad_norm": 0.17382531586528463, "learning_rate": 1.2909906800138074e-05, "loss": 0.4845, "step": 1349 }, { "epoch": 1.2581547064305685, "grad_norm": 0.16996282699749446, "learning_rate": 1.2903003106662065e-05, "loss": 0.4597, "step": 1350 }, { "epoch": 1.2590866728797763, "grad_norm": 0.1687614606467213, "learning_rate": 1.2896099413186055e-05, "loss": 0.4846, "step": 1351 }, { "epoch": 1.260018639328984, "grad_norm": 0.15658899544904137, "learning_rate": 1.2889195719710046e-05, "loss": 0.4745, "step": 1352 }, { "epoch": 1.2609506057781918, "grad_norm": 0.17761078891436377, "learning_rate": 1.2882292026234035e-05, "loss": 0.496, "step": 1353 }, { "epoch": 1.2618825722273999, "grad_norm": 0.17240067191580075, "learning_rate": 1.2875388332758028e-05, "loss": 0.4977, "step": 1354 }, { "epoch": 1.2628145386766076, "grad_norm": 0.1535162381333331, "learning_rate": 1.2868484639282017e-05, "loss": 0.4647, "step": 1355 }, { "epoch": 1.2637465051258154, "grad_norm": 0.16827247273289944, "learning_rate": 1.2861580945806007e-05, "loss": 0.5049, "step": 1356 }, { "epoch": 1.2646784715750232, "grad_norm": 0.14960710784950218, "learning_rate": 1.2854677252329996e-05, "loss": 0.458, "step": 1357 }, { "epoch": 1.2656104380242312, "grad_norm": 0.14922814827403405, "learning_rate": 1.2847773558853989e-05, "loss": 0.4746, "step": 1358 }, { "epoch": 1.266542404473439, "grad_norm": 0.15746458264107965, "learning_rate": 1.2840869865377978e-05, "loss": 0.4337, "step": 1359 }, { "epoch": 1.2674743709226468, "grad_norm": 0.1563770345114773, "learning_rate": 1.2833966171901969e-05, "loss": 0.4615, "step": 1360 }, { "epoch": 1.2684063373718546, "grad_norm": 0.15421789840662753, "learning_rate": 1.2827062478425958e-05, "loss": 0.4885, "step": 1361 }, { "epoch": 1.2693383038210624, "grad_norm": 0.14221277052015255, "learning_rate": 1.282015878494995e-05, "loss": 0.4622, "step": 1362 }, { "epoch": 1.2702702702702702, "grad_norm": 0.14532731905369667, "learning_rate": 1.2813255091473939e-05, "loss": 0.4471, "step": 1363 }, { "epoch": 1.271202236719478, "grad_norm": 0.15740718705384935, "learning_rate": 1.280635139799793e-05, "loss": 0.4598, "step": 1364 }, { "epoch": 1.272134203168686, "grad_norm": 0.15497229311241245, "learning_rate": 1.279944770452192e-05, "loss": 0.4706, "step": 1365 }, { "epoch": 1.2730661696178938, "grad_norm": 0.15107635027226438, "learning_rate": 1.2792544011045911e-05, "loss": 0.4753, "step": 1366 }, { "epoch": 1.2739981360671015, "grad_norm": 0.16242589785939324, "learning_rate": 1.27856403175699e-05, "loss": 0.4685, "step": 1367 }, { "epoch": 1.2749301025163093, "grad_norm": 0.20258319007118528, "learning_rate": 1.2778736624093891e-05, "loss": 0.4954, "step": 1368 }, { "epoch": 1.2758620689655173, "grad_norm": 0.1568982938968355, "learning_rate": 1.2771832930617882e-05, "loss": 0.4592, "step": 1369 }, { "epoch": 1.2767940354147251, "grad_norm": 0.16106821597302934, "learning_rate": 1.2764929237141873e-05, "loss": 0.4692, "step": 1370 }, { "epoch": 1.277726001863933, "grad_norm": 0.1556697873596116, "learning_rate": 1.2758025543665862e-05, "loss": 0.4651, "step": 1371 }, { "epoch": 1.2786579683131407, "grad_norm": 0.164059496655891, "learning_rate": 1.2751121850189854e-05, "loss": 0.469, "step": 1372 }, { "epoch": 1.2795899347623485, "grad_norm": 0.15314029067338206, "learning_rate": 1.2744218156713843e-05, "loss": 0.4431, "step": 1373 }, { "epoch": 1.2805219012115563, "grad_norm": 0.1507003262794299, "learning_rate": 1.2737314463237834e-05, "loss": 0.4743, "step": 1374 }, { "epoch": 1.281453867660764, "grad_norm": 0.15603402158315452, "learning_rate": 1.2730410769761823e-05, "loss": 0.4438, "step": 1375 }, { "epoch": 1.282385834109972, "grad_norm": 0.1658041152173838, "learning_rate": 1.2723507076285815e-05, "loss": 0.4788, "step": 1376 }, { "epoch": 1.2833178005591799, "grad_norm": 0.15843230926728055, "learning_rate": 1.2716603382809804e-05, "loss": 0.4421, "step": 1377 }, { "epoch": 1.2842497670083877, "grad_norm": 0.15685628218481953, "learning_rate": 1.2709699689333795e-05, "loss": 0.4802, "step": 1378 }, { "epoch": 1.2851817334575955, "grad_norm": 0.16681775407003768, "learning_rate": 1.2702795995857784e-05, "loss": 0.4746, "step": 1379 }, { "epoch": 1.2861136999068035, "grad_norm": 0.1661964355782647, "learning_rate": 1.2695892302381776e-05, "loss": 0.5009, "step": 1380 }, { "epoch": 1.2870456663560113, "grad_norm": 0.1602535379605942, "learning_rate": 1.2688988608905766e-05, "loss": 0.4767, "step": 1381 }, { "epoch": 1.287977632805219, "grad_norm": 0.15351512045718102, "learning_rate": 1.2682084915429755e-05, "loss": 0.468, "step": 1382 }, { "epoch": 1.2889095992544268, "grad_norm": 0.16029811161301263, "learning_rate": 1.2675181221953747e-05, "loss": 0.493, "step": 1383 }, { "epoch": 1.2898415657036346, "grad_norm": 0.1453001215751867, "learning_rate": 1.2668277528477738e-05, "loss": 0.4762, "step": 1384 }, { "epoch": 1.2907735321528424, "grad_norm": 0.159064208220918, "learning_rate": 1.2661373835001727e-05, "loss": 0.4787, "step": 1385 }, { "epoch": 1.2917054986020502, "grad_norm": 0.17199137198085773, "learning_rate": 1.2654470141525716e-05, "loss": 0.4585, "step": 1386 }, { "epoch": 1.2926374650512582, "grad_norm": 0.15630754064550592, "learning_rate": 1.2647566448049708e-05, "loss": 0.4604, "step": 1387 }, { "epoch": 1.293569431500466, "grad_norm": 0.15749924860533285, "learning_rate": 1.2640662754573699e-05, "loss": 0.4823, "step": 1388 }, { "epoch": 1.2945013979496738, "grad_norm": 0.15842597887116103, "learning_rate": 1.2633759061097688e-05, "loss": 0.4705, "step": 1389 }, { "epoch": 1.2954333643988816, "grad_norm": 0.16357355015309827, "learning_rate": 1.2626855367621677e-05, "loss": 0.4613, "step": 1390 }, { "epoch": 1.2963653308480896, "grad_norm": 0.15401507146826898, "learning_rate": 1.261995167414567e-05, "loss": 0.4542, "step": 1391 }, { "epoch": 1.2972972972972974, "grad_norm": 0.15044601599539206, "learning_rate": 1.2613047980669658e-05, "loss": 0.4747, "step": 1392 }, { "epoch": 1.2982292637465052, "grad_norm": 0.17176821901500805, "learning_rate": 1.260614428719365e-05, "loss": 0.4788, "step": 1393 }, { "epoch": 1.299161230195713, "grad_norm": 0.16185884937488906, "learning_rate": 1.2599240593717642e-05, "loss": 0.4904, "step": 1394 }, { "epoch": 1.3000931966449207, "grad_norm": 0.15745684183810674, "learning_rate": 1.259233690024163e-05, "loss": 0.4693, "step": 1395 }, { "epoch": 1.3010251630941285, "grad_norm": 0.16301218351260022, "learning_rate": 1.258543320676562e-05, "loss": 0.4648, "step": 1396 }, { "epoch": 1.3019571295433363, "grad_norm": 0.14813366337602493, "learning_rate": 1.257852951328961e-05, "loss": 0.4718, "step": 1397 }, { "epoch": 1.3028890959925443, "grad_norm": 0.16300138696287808, "learning_rate": 1.2571625819813603e-05, "loss": 0.4571, "step": 1398 }, { "epoch": 1.303821062441752, "grad_norm": 0.1591117834410047, "learning_rate": 1.2564722126337592e-05, "loss": 0.4599, "step": 1399 }, { "epoch": 1.30475302889096, "grad_norm": 0.1594771297528884, "learning_rate": 1.2557818432861581e-05, "loss": 0.4709, "step": 1400 }, { "epoch": 1.3056849953401677, "grad_norm": 0.15601247872747828, "learning_rate": 1.2550914739385572e-05, "loss": 0.4799, "step": 1401 }, { "epoch": 1.3066169617893757, "grad_norm": 0.1521335216886732, "learning_rate": 1.2544011045909562e-05, "loss": 0.5037, "step": 1402 }, { "epoch": 1.3075489282385835, "grad_norm": 0.14959748812449358, "learning_rate": 1.2537107352433553e-05, "loss": 0.4451, "step": 1403 }, { "epoch": 1.3084808946877913, "grad_norm": 0.17510269471133555, "learning_rate": 1.2530203658957542e-05, "loss": 0.4897, "step": 1404 }, { "epoch": 1.309412861136999, "grad_norm": 0.1514449559888547, "learning_rate": 1.2523299965481535e-05, "loss": 0.467, "step": 1405 }, { "epoch": 1.3103448275862069, "grad_norm": 0.14589844954679823, "learning_rate": 1.2516396272005524e-05, "loss": 0.4633, "step": 1406 }, { "epoch": 1.3112767940354146, "grad_norm": 0.15859706129081563, "learning_rate": 1.2509492578529514e-05, "loss": 0.4515, "step": 1407 }, { "epoch": 1.3122087604846224, "grad_norm": 0.15605172760144959, "learning_rate": 1.2502588885053503e-05, "loss": 0.4652, "step": 1408 }, { "epoch": 1.3131407269338304, "grad_norm": 0.16495167398083294, "learning_rate": 1.2495685191577496e-05, "loss": 0.4577, "step": 1409 }, { "epoch": 1.3140726933830382, "grad_norm": 0.15732400429741383, "learning_rate": 1.2488781498101485e-05, "loss": 0.4886, "step": 1410 }, { "epoch": 1.315004659832246, "grad_norm": 0.15579904608980588, "learning_rate": 1.2481877804625476e-05, "loss": 0.4684, "step": 1411 }, { "epoch": 1.3159366262814538, "grad_norm": 0.14697963002167433, "learning_rate": 1.2474974111149465e-05, "loss": 0.4477, "step": 1412 }, { "epoch": 1.3168685927306618, "grad_norm": 0.15660011793797707, "learning_rate": 1.2468070417673457e-05, "loss": 0.461, "step": 1413 }, { "epoch": 1.3178005591798696, "grad_norm": 0.16364242688747996, "learning_rate": 1.2461166724197446e-05, "loss": 0.4936, "step": 1414 }, { "epoch": 1.3187325256290774, "grad_norm": 0.15283312386030176, "learning_rate": 1.2454263030721437e-05, "loss": 0.4772, "step": 1415 }, { "epoch": 1.3196644920782852, "grad_norm": 0.1745995508906787, "learning_rate": 1.2447359337245428e-05, "loss": 0.449, "step": 1416 }, { "epoch": 1.320596458527493, "grad_norm": 0.13730432377489898, "learning_rate": 1.2440455643769418e-05, "loss": 0.4305, "step": 1417 }, { "epoch": 1.3215284249767008, "grad_norm": 0.15516495733976168, "learning_rate": 1.2433551950293407e-05, "loss": 0.4697, "step": 1418 }, { "epoch": 1.3224603914259085, "grad_norm": 0.17679229974799712, "learning_rate": 1.2426648256817398e-05, "loss": 0.4514, "step": 1419 }, { "epoch": 1.3233923578751166, "grad_norm": 0.15207070768255077, "learning_rate": 1.2419744563341389e-05, "loss": 0.4632, "step": 1420 }, { "epoch": 1.3243243243243243, "grad_norm": 0.14633331409282785, "learning_rate": 1.241284086986538e-05, "loss": 0.4423, "step": 1421 }, { "epoch": 1.3252562907735321, "grad_norm": 0.15367061246563424, "learning_rate": 1.2405937176389369e-05, "loss": 0.4672, "step": 1422 }, { "epoch": 1.32618825722274, "grad_norm": 0.1761111947031513, "learning_rate": 1.2399033482913361e-05, "loss": 0.4819, "step": 1423 }, { "epoch": 1.327120223671948, "grad_norm": 0.15645576465067315, "learning_rate": 1.239212978943735e-05, "loss": 0.4607, "step": 1424 }, { "epoch": 1.3280521901211557, "grad_norm": 0.15691337254095952, "learning_rate": 1.238522609596134e-05, "loss": 0.4764, "step": 1425 }, { "epoch": 1.3289841565703635, "grad_norm": 0.16234637813034986, "learning_rate": 1.237832240248533e-05, "loss": 0.504, "step": 1426 }, { "epoch": 1.3299161230195713, "grad_norm": 0.16663045719088346, "learning_rate": 1.2371418709009322e-05, "loss": 0.4661, "step": 1427 }, { "epoch": 1.330848089468779, "grad_norm": 0.15481782131910765, "learning_rate": 1.2364515015533311e-05, "loss": 0.4353, "step": 1428 }, { "epoch": 1.3317800559179869, "grad_norm": 0.1592432838035464, "learning_rate": 1.2357611322057302e-05, "loss": 0.4914, "step": 1429 }, { "epoch": 1.3327120223671947, "grad_norm": 0.15222531135781983, "learning_rate": 1.2350707628581291e-05, "loss": 0.4689, "step": 1430 }, { "epoch": 1.3336439888164027, "grad_norm": 0.14817701257716592, "learning_rate": 1.2343803935105284e-05, "loss": 0.4598, "step": 1431 }, { "epoch": 1.3345759552656105, "grad_norm": 0.16979832997772873, "learning_rate": 1.2336900241629273e-05, "loss": 0.4987, "step": 1432 }, { "epoch": 1.3355079217148182, "grad_norm": 0.15744351480646676, "learning_rate": 1.2329996548153262e-05, "loss": 0.4777, "step": 1433 }, { "epoch": 1.336439888164026, "grad_norm": 0.15779833531001647, "learning_rate": 1.2323092854677254e-05, "loss": 0.4872, "step": 1434 }, { "epoch": 1.337371854613234, "grad_norm": 0.15149436720722434, "learning_rate": 1.2316189161201245e-05, "loss": 0.4624, "step": 1435 }, { "epoch": 1.3383038210624418, "grad_norm": 0.1504759650685106, "learning_rate": 1.2309285467725234e-05, "loss": 0.4792, "step": 1436 }, { "epoch": 1.3392357875116496, "grad_norm": 0.1675835171631486, "learning_rate": 1.2302381774249223e-05, "loss": 0.4612, "step": 1437 }, { "epoch": 1.3401677539608574, "grad_norm": 0.15698926498246482, "learning_rate": 1.2295478080773215e-05, "loss": 0.4721, "step": 1438 }, { "epoch": 1.3410997204100652, "grad_norm": 0.1498997564092424, "learning_rate": 1.2288574387297206e-05, "loss": 0.4543, "step": 1439 }, { "epoch": 1.342031686859273, "grad_norm": 0.14933330334989509, "learning_rate": 1.2281670693821195e-05, "loss": 0.4745, "step": 1440 }, { "epoch": 1.3429636533084808, "grad_norm": 0.15637577570321604, "learning_rate": 1.2274767000345184e-05, "loss": 0.4927, "step": 1441 }, { "epoch": 1.3438956197576888, "grad_norm": 0.146736192507584, "learning_rate": 1.2267863306869176e-05, "loss": 0.4528, "step": 1442 }, { "epoch": 1.3448275862068966, "grad_norm": 0.15914607079752616, "learning_rate": 1.2260959613393166e-05, "loss": 0.4803, "step": 1443 }, { "epoch": 1.3457595526561044, "grad_norm": 0.15670936217392667, "learning_rate": 1.2254055919917156e-05, "loss": 0.4974, "step": 1444 }, { "epoch": 1.3466915191053122, "grad_norm": 0.15105971617010114, "learning_rate": 1.2247152226441149e-05, "loss": 0.4458, "step": 1445 }, { "epoch": 1.3476234855545202, "grad_norm": 0.156446985875296, "learning_rate": 1.2240248532965138e-05, "loss": 0.463, "step": 1446 }, { "epoch": 1.348555452003728, "grad_norm": 0.15375597188674722, "learning_rate": 1.2233344839489127e-05, "loss": 0.4384, "step": 1447 }, { "epoch": 1.3494874184529357, "grad_norm": 0.15555290928773643, "learning_rate": 1.2226441146013117e-05, "loss": 0.4722, "step": 1448 }, { "epoch": 1.3504193849021435, "grad_norm": 0.17377293207938752, "learning_rate": 1.221953745253711e-05, "loss": 0.4619, "step": 1449 }, { "epoch": 1.3513513513513513, "grad_norm": 0.148311189455885, "learning_rate": 1.2212633759061099e-05, "loss": 0.4811, "step": 1450 }, { "epoch": 1.352283317800559, "grad_norm": 0.16430399078523938, "learning_rate": 1.2205730065585088e-05, "loss": 0.4791, "step": 1451 }, { "epoch": 1.353215284249767, "grad_norm": 0.1566434865381781, "learning_rate": 1.2198826372109079e-05, "loss": 0.493, "step": 1452 }, { "epoch": 1.354147250698975, "grad_norm": 0.15075221956745122, "learning_rate": 1.219192267863307e-05, "loss": 0.4762, "step": 1453 }, { "epoch": 1.3550792171481827, "grad_norm": 0.1448196460713284, "learning_rate": 1.218501898515706e-05, "loss": 0.4437, "step": 1454 }, { "epoch": 1.3560111835973905, "grad_norm": 0.15931035014857708, "learning_rate": 1.217811529168105e-05, "loss": 0.4592, "step": 1455 }, { "epoch": 1.3569431500465983, "grad_norm": 0.17225811785598805, "learning_rate": 1.2171211598205042e-05, "loss": 0.4632, "step": 1456 }, { "epoch": 1.3578751164958063, "grad_norm": 0.15737687298661646, "learning_rate": 1.216430790472903e-05, "loss": 0.4975, "step": 1457 }, { "epoch": 1.358807082945014, "grad_norm": 0.1684430147733219, "learning_rate": 1.2157404211253021e-05, "loss": 0.4833, "step": 1458 }, { "epoch": 1.3597390493942219, "grad_norm": 0.18291926331369024, "learning_rate": 1.215050051777701e-05, "loss": 0.4988, "step": 1459 }, { "epoch": 1.3606710158434296, "grad_norm": 0.1529183390469512, "learning_rate": 1.2143596824301003e-05, "loss": 0.4466, "step": 1460 }, { "epoch": 1.3616029822926374, "grad_norm": 0.15479130821500942, "learning_rate": 1.2136693130824992e-05, "loss": 0.4693, "step": 1461 }, { "epoch": 1.3625349487418452, "grad_norm": 0.16275253258029482, "learning_rate": 1.2129789437348983e-05, "loss": 0.4632, "step": 1462 }, { "epoch": 1.363466915191053, "grad_norm": 0.17481120942891637, "learning_rate": 1.2122885743872973e-05, "loss": 0.4845, "step": 1463 }, { "epoch": 1.364398881640261, "grad_norm": 0.14985031266698864, "learning_rate": 1.2115982050396964e-05, "loss": 0.4667, "step": 1464 }, { "epoch": 1.3653308480894688, "grad_norm": 0.15517333673778066, "learning_rate": 1.2109078356920953e-05, "loss": 0.4847, "step": 1465 }, { "epoch": 1.3662628145386766, "grad_norm": 0.1587330487203697, "learning_rate": 1.2102174663444944e-05, "loss": 0.475, "step": 1466 }, { "epoch": 1.3671947809878844, "grad_norm": 0.15215492731308047, "learning_rate": 1.2095270969968935e-05, "loss": 0.4534, "step": 1467 }, { "epoch": 1.3681267474370924, "grad_norm": 0.15085775190440412, "learning_rate": 1.2088367276492925e-05, "loss": 0.4571, "step": 1468 }, { "epoch": 1.3690587138863002, "grad_norm": 0.15846312365125798, "learning_rate": 1.2081463583016914e-05, "loss": 0.4714, "step": 1469 }, { "epoch": 1.369990680335508, "grad_norm": 0.1606545333631716, "learning_rate": 1.2074559889540905e-05, "loss": 0.4734, "step": 1470 }, { "epoch": 1.3709226467847158, "grad_norm": 0.15007482454662055, "learning_rate": 1.2067656196064896e-05, "loss": 0.4571, "step": 1471 }, { "epoch": 1.3718546132339235, "grad_norm": 0.15915818201264242, "learning_rate": 1.2060752502588887e-05, "loss": 0.4689, "step": 1472 }, { "epoch": 1.3727865796831313, "grad_norm": 0.15628917206931658, "learning_rate": 1.2053848809112876e-05, "loss": 0.4737, "step": 1473 }, { "epoch": 1.3737185461323391, "grad_norm": 0.15132943039879349, "learning_rate": 1.2046945115636868e-05, "loss": 0.4667, "step": 1474 }, { "epoch": 1.3746505125815471, "grad_norm": 0.14630603165996742, "learning_rate": 1.2040041422160857e-05, "loss": 0.4488, "step": 1475 }, { "epoch": 1.375582479030755, "grad_norm": 0.15809311278935184, "learning_rate": 1.2033137728684848e-05, "loss": 0.4518, "step": 1476 }, { "epoch": 1.3765144454799627, "grad_norm": 0.1528557028661318, "learning_rate": 1.2026234035208837e-05, "loss": 0.427, "step": 1477 }, { "epoch": 1.3774464119291705, "grad_norm": 0.1449391919509678, "learning_rate": 1.201933034173283e-05, "loss": 0.4651, "step": 1478 }, { "epoch": 1.3783783783783785, "grad_norm": 0.15060001877423637, "learning_rate": 1.2012426648256818e-05, "loss": 0.4525, "step": 1479 }, { "epoch": 1.3793103448275863, "grad_norm": 0.1462485060359153, "learning_rate": 1.2005522954780809e-05, "loss": 0.4528, "step": 1480 }, { "epoch": 1.380242311276794, "grad_norm": 0.15128142171164496, "learning_rate": 1.1998619261304798e-05, "loss": 0.4605, "step": 1481 }, { "epoch": 1.3811742777260019, "grad_norm": 0.14735237730460649, "learning_rate": 1.199171556782879e-05, "loss": 0.4636, "step": 1482 }, { "epoch": 1.3821062441752097, "grad_norm": 0.15170047148814458, "learning_rate": 1.198481187435278e-05, "loss": 0.4593, "step": 1483 }, { "epoch": 1.3830382106244175, "grad_norm": 0.17149408067751032, "learning_rate": 1.1977908180876769e-05, "loss": 0.4981, "step": 1484 }, { "epoch": 1.3839701770736252, "grad_norm": 0.14735705330169105, "learning_rate": 1.1971004487400761e-05, "loss": 0.4569, "step": 1485 }, { "epoch": 1.3849021435228333, "grad_norm": 0.15925990636180015, "learning_rate": 1.1964100793924752e-05, "loss": 0.4895, "step": 1486 }, { "epoch": 1.385834109972041, "grad_norm": 0.15382936848428455, "learning_rate": 1.195719710044874e-05, "loss": 0.4722, "step": 1487 }, { "epoch": 1.3867660764212488, "grad_norm": 0.1619967967494574, "learning_rate": 1.195029340697273e-05, "loss": 0.4635, "step": 1488 }, { "epoch": 1.3876980428704566, "grad_norm": 0.15564622889094082, "learning_rate": 1.1943389713496722e-05, "loss": 0.4762, "step": 1489 }, { "epoch": 1.3886300093196646, "grad_norm": 0.15169229737958442, "learning_rate": 1.1936486020020713e-05, "loss": 0.4726, "step": 1490 }, { "epoch": 1.3895619757688724, "grad_norm": 0.15324141252366325, "learning_rate": 1.1929582326544702e-05, "loss": 0.4818, "step": 1491 }, { "epoch": 1.3904939422180802, "grad_norm": 0.15460472780134746, "learning_rate": 1.1922678633068691e-05, "loss": 0.4666, "step": 1492 }, { "epoch": 1.391425908667288, "grad_norm": 0.1592331957007319, "learning_rate": 1.1915774939592684e-05, "loss": 0.4842, "step": 1493 }, { "epoch": 1.3923578751164958, "grad_norm": 0.15119098340816614, "learning_rate": 1.1908871246116673e-05, "loss": 0.48, "step": 1494 }, { "epoch": 1.3932898415657036, "grad_norm": 0.15689560428469043, "learning_rate": 1.1901967552640663e-05, "loss": 0.4975, "step": 1495 }, { "epoch": 1.3942218080149114, "grad_norm": 0.1595261700029944, "learning_rate": 1.1895063859164656e-05, "loss": 0.4843, "step": 1496 }, { "epoch": 1.3951537744641194, "grad_norm": 0.14703853075024886, "learning_rate": 1.1888160165688645e-05, "loss": 0.4847, "step": 1497 }, { "epoch": 1.3960857409133272, "grad_norm": 0.16294656522655657, "learning_rate": 1.1881256472212634e-05, "loss": 0.4589, "step": 1498 }, { "epoch": 1.397017707362535, "grad_norm": 0.17976630451712275, "learning_rate": 1.1874352778736625e-05, "loss": 0.4769, "step": 1499 }, { "epoch": 1.3979496738117427, "grad_norm": 0.16223058604955468, "learning_rate": 1.1867449085260617e-05, "loss": 0.5139, "step": 1500 }, { "epoch": 1.3988816402609507, "grad_norm": 0.15824484175379713, "learning_rate": 1.1860545391784606e-05, "loss": 0.479, "step": 1501 }, { "epoch": 1.3998136067101585, "grad_norm": 0.15084927376653026, "learning_rate": 1.1853641698308595e-05, "loss": 0.455, "step": 1502 }, { "epoch": 1.4007455731593663, "grad_norm": 0.16537193100854944, "learning_rate": 1.1846738004832586e-05, "loss": 0.4671, "step": 1503 }, { "epoch": 1.401677539608574, "grad_norm": 0.1482270974674262, "learning_rate": 1.1839834311356576e-05, "loss": 0.4576, "step": 1504 }, { "epoch": 1.402609506057782, "grad_norm": 0.1719980262721953, "learning_rate": 1.1832930617880567e-05, "loss": 0.5212, "step": 1505 }, { "epoch": 1.4035414725069897, "grad_norm": 0.16337937491465448, "learning_rate": 1.1826026924404556e-05, "loss": 0.4736, "step": 1506 }, { "epoch": 1.4044734389561975, "grad_norm": 0.14924131047514239, "learning_rate": 1.1819123230928549e-05, "loss": 0.4809, "step": 1507 }, { "epoch": 1.4054054054054055, "grad_norm": 0.22015539484630184, "learning_rate": 1.1812219537452538e-05, "loss": 0.455, "step": 1508 }, { "epoch": 1.4063373718546133, "grad_norm": 0.1618897896778329, "learning_rate": 1.1805315843976528e-05, "loss": 0.4703, "step": 1509 }, { "epoch": 1.407269338303821, "grad_norm": 0.16135048607773236, "learning_rate": 1.1798412150500517e-05, "loss": 0.4552, "step": 1510 }, { "epoch": 1.4082013047530288, "grad_norm": 0.15428575676053918, "learning_rate": 1.179150845702451e-05, "loss": 0.4664, "step": 1511 }, { "epoch": 1.4091332712022366, "grad_norm": 0.14732260820088997, "learning_rate": 1.1784604763548499e-05, "loss": 0.4483, "step": 1512 }, { "epoch": 1.4100652376514446, "grad_norm": 0.15829846077218043, "learning_rate": 1.177770107007249e-05, "loss": 0.4717, "step": 1513 }, { "epoch": 1.4109972041006524, "grad_norm": 0.16028089545517032, "learning_rate": 1.177079737659648e-05, "loss": 0.4641, "step": 1514 }, { "epoch": 1.4119291705498602, "grad_norm": 0.1591880382292611, "learning_rate": 1.1763893683120471e-05, "loss": 0.4981, "step": 1515 }, { "epoch": 1.412861136999068, "grad_norm": 0.1521509653782006, "learning_rate": 1.175698998964446e-05, "loss": 0.4526, "step": 1516 }, { "epoch": 1.4137931034482758, "grad_norm": 0.1565844780703632, "learning_rate": 1.1750086296168451e-05, "loss": 0.4747, "step": 1517 }, { "epoch": 1.4147250698974836, "grad_norm": 0.16376347072633965, "learning_rate": 1.1743182602692442e-05, "loss": 0.4738, "step": 1518 }, { "epoch": 1.4156570363466916, "grad_norm": 0.16071993107504878, "learning_rate": 1.1736278909216432e-05, "loss": 0.4721, "step": 1519 }, { "epoch": 1.4165890027958994, "grad_norm": 0.15857886235071647, "learning_rate": 1.1729375215740421e-05, "loss": 0.4597, "step": 1520 }, { "epoch": 1.4175209692451072, "grad_norm": 0.15443078436647403, "learning_rate": 1.1722471522264412e-05, "loss": 0.4655, "step": 1521 }, { "epoch": 1.418452935694315, "grad_norm": 0.156167867779119, "learning_rate": 1.1715567828788403e-05, "loss": 0.4704, "step": 1522 }, { "epoch": 1.4193849021435228, "grad_norm": 0.1509018414817061, "learning_rate": 1.1708664135312394e-05, "loss": 0.4485, "step": 1523 }, { "epoch": 1.4203168685927308, "grad_norm": 0.1628392083043079, "learning_rate": 1.1701760441836383e-05, "loss": 0.4629, "step": 1524 }, { "epoch": 1.4212488350419386, "grad_norm": 0.1583965177800337, "learning_rate": 1.1694856748360375e-05, "loss": 0.4905, "step": 1525 }, { "epoch": 1.4221808014911463, "grad_norm": 0.14866369641812913, "learning_rate": 1.1687953054884364e-05, "loss": 0.4634, "step": 1526 }, { "epoch": 1.4231127679403541, "grad_norm": 0.15304786877354626, "learning_rate": 1.1681049361408355e-05, "loss": 0.468, "step": 1527 }, { "epoch": 1.424044734389562, "grad_norm": 0.148264799885534, "learning_rate": 1.1674145667932344e-05, "loss": 0.4578, "step": 1528 }, { "epoch": 1.4249767008387697, "grad_norm": 0.14385728366412232, "learning_rate": 1.1667241974456336e-05, "loss": 0.4785, "step": 1529 }, { "epoch": 1.4259086672879777, "grad_norm": 0.15972615064442197, "learning_rate": 1.1660338280980325e-05, "loss": 0.459, "step": 1530 }, { "epoch": 1.4268406337371855, "grad_norm": 0.16403980727185988, "learning_rate": 1.1653434587504316e-05, "loss": 0.4578, "step": 1531 }, { "epoch": 1.4277726001863933, "grad_norm": 0.16228963808962718, "learning_rate": 1.1646530894028305e-05, "loss": 0.4617, "step": 1532 }, { "epoch": 1.428704566635601, "grad_norm": 0.15849649317437436, "learning_rate": 1.1639627200552298e-05, "loss": 0.4956, "step": 1533 }, { "epoch": 1.4296365330848089, "grad_norm": 0.1535266978332383, "learning_rate": 1.1632723507076287e-05, "loss": 0.4393, "step": 1534 }, { "epoch": 1.4305684995340169, "grad_norm": 0.15781055942512812, "learning_rate": 1.1625819813600276e-05, "loss": 0.4976, "step": 1535 }, { "epoch": 1.4315004659832247, "grad_norm": 0.1551071430808602, "learning_rate": 1.1618916120124268e-05, "loss": 0.4771, "step": 1536 }, { "epoch": 1.4324324324324325, "grad_norm": 0.1606091539081305, "learning_rate": 1.1612012426648259e-05, "loss": 0.4563, "step": 1537 }, { "epoch": 1.4333643988816402, "grad_norm": 0.1694255074271535, "learning_rate": 1.1605108733172248e-05, "loss": 0.5073, "step": 1538 }, { "epoch": 1.434296365330848, "grad_norm": 0.14809486388219184, "learning_rate": 1.1598205039696237e-05, "loss": 0.4456, "step": 1539 }, { "epoch": 1.4352283317800558, "grad_norm": 0.15021887250351748, "learning_rate": 1.159130134622023e-05, "loss": 0.4241, "step": 1540 }, { "epoch": 1.4361602982292636, "grad_norm": 0.1562951124724361, "learning_rate": 1.158439765274422e-05, "loss": 0.4615, "step": 1541 }, { "epoch": 1.4370922646784716, "grad_norm": 0.1761795088114851, "learning_rate": 1.1577493959268209e-05, "loss": 0.4845, "step": 1542 }, { "epoch": 1.4380242311276794, "grad_norm": 0.16699882170638566, "learning_rate": 1.1570590265792198e-05, "loss": 0.476, "step": 1543 }, { "epoch": 1.4389561975768872, "grad_norm": 0.1574026490748086, "learning_rate": 1.156368657231619e-05, "loss": 0.4568, "step": 1544 }, { "epoch": 1.439888164026095, "grad_norm": 0.16684291221935943, "learning_rate": 1.155678287884018e-05, "loss": 0.4523, "step": 1545 }, { "epoch": 1.440820130475303, "grad_norm": 0.15629145583351522, "learning_rate": 1.154987918536417e-05, "loss": 0.456, "step": 1546 }, { "epoch": 1.4417520969245108, "grad_norm": 0.15993060730551129, "learning_rate": 1.1542975491888163e-05, "loss": 0.4631, "step": 1547 }, { "epoch": 1.4426840633737186, "grad_norm": 0.15269235937236317, "learning_rate": 1.1536071798412152e-05, "loss": 0.4548, "step": 1548 }, { "epoch": 1.4436160298229264, "grad_norm": 0.1543799206005378, "learning_rate": 1.152916810493614e-05, "loss": 0.4621, "step": 1549 }, { "epoch": 1.4445479962721341, "grad_norm": 0.15509288449699496, "learning_rate": 1.1522264411460132e-05, "loss": 0.467, "step": 1550 }, { "epoch": 1.445479962721342, "grad_norm": 0.15409967155551982, "learning_rate": 1.1515360717984124e-05, "loss": 0.4661, "step": 1551 }, { "epoch": 1.4464119291705497, "grad_norm": 0.14397401280201125, "learning_rate": 1.1508457024508113e-05, "loss": 0.4613, "step": 1552 }, { "epoch": 1.4473438956197577, "grad_norm": 0.15684216322493894, "learning_rate": 1.1501553331032102e-05, "loss": 0.4454, "step": 1553 }, { "epoch": 1.4482758620689655, "grad_norm": 0.150852597286939, "learning_rate": 1.1494649637556094e-05, "loss": 0.4598, "step": 1554 }, { "epoch": 1.4492078285181733, "grad_norm": 0.16659107298929227, "learning_rate": 1.1487745944080084e-05, "loss": 0.4769, "step": 1555 }, { "epoch": 1.450139794967381, "grad_norm": 0.1496848750286405, "learning_rate": 1.1480842250604074e-05, "loss": 0.4702, "step": 1556 }, { "epoch": 1.4510717614165891, "grad_norm": 0.1561784205567715, "learning_rate": 1.1473938557128063e-05, "loss": 0.4548, "step": 1557 }, { "epoch": 1.452003727865797, "grad_norm": 0.15069394601730624, "learning_rate": 1.1467034863652056e-05, "loss": 0.4486, "step": 1558 }, { "epoch": 1.4529356943150047, "grad_norm": 0.1549345424655382, "learning_rate": 1.1460131170176045e-05, "loss": 0.4516, "step": 1559 }, { "epoch": 1.4538676607642125, "grad_norm": 0.15283815941119228, "learning_rate": 1.1453227476700035e-05, "loss": 0.4742, "step": 1560 }, { "epoch": 1.4547996272134203, "grad_norm": 0.1515856450451969, "learning_rate": 1.1446323783224025e-05, "loss": 0.4514, "step": 1561 }, { "epoch": 1.455731593662628, "grad_norm": 0.16701003817713855, "learning_rate": 1.1439420089748017e-05, "loss": 0.4762, "step": 1562 }, { "epoch": 1.4566635601118358, "grad_norm": 0.14989137751233814, "learning_rate": 1.1432516396272006e-05, "loss": 0.4599, "step": 1563 }, { "epoch": 1.4575955265610439, "grad_norm": 0.15739298867657747, "learning_rate": 1.1425612702795997e-05, "loss": 0.4722, "step": 1564 }, { "epoch": 1.4585274930102516, "grad_norm": 0.15200754707492883, "learning_rate": 1.1418709009319987e-05, "loss": 0.4511, "step": 1565 }, { "epoch": 1.4594594594594594, "grad_norm": 0.15764365770012762, "learning_rate": 1.1411805315843978e-05, "loss": 0.5026, "step": 1566 }, { "epoch": 1.4603914259086672, "grad_norm": 0.14638846710810546, "learning_rate": 1.1404901622367967e-05, "loss": 0.4369, "step": 1567 }, { "epoch": 1.4613233923578752, "grad_norm": 0.15500427190312882, "learning_rate": 1.1397997928891958e-05, "loss": 0.471, "step": 1568 }, { "epoch": 1.462255358807083, "grad_norm": 0.14812744593878588, "learning_rate": 1.1391094235415949e-05, "loss": 0.4763, "step": 1569 }, { "epoch": 1.4631873252562908, "grad_norm": 0.14513403561985433, "learning_rate": 1.138419054193994e-05, "loss": 0.4431, "step": 1570 }, { "epoch": 1.4641192917054986, "grad_norm": 0.16951849440058409, "learning_rate": 1.1377286848463928e-05, "loss": 0.4763, "step": 1571 }, { "epoch": 1.4650512581547064, "grad_norm": 0.14362042430248892, "learning_rate": 1.137038315498792e-05, "loss": 0.468, "step": 1572 }, { "epoch": 1.4659832246039142, "grad_norm": 0.15342122797629884, "learning_rate": 1.136347946151191e-05, "loss": 0.4586, "step": 1573 }, { "epoch": 1.466915191053122, "grad_norm": 0.16674759217342824, "learning_rate": 1.13565757680359e-05, "loss": 0.4884, "step": 1574 }, { "epoch": 1.46784715750233, "grad_norm": 0.15036656397222636, "learning_rate": 1.134967207455989e-05, "loss": 0.4727, "step": 1575 }, { "epoch": 1.4687791239515378, "grad_norm": 0.14402788724109664, "learning_rate": 1.1342768381083882e-05, "loss": 0.4478, "step": 1576 }, { "epoch": 1.4697110904007455, "grad_norm": 0.14643616435554385, "learning_rate": 1.1335864687607871e-05, "loss": 0.4632, "step": 1577 }, { "epoch": 1.4706430568499533, "grad_norm": 0.15933894293223907, "learning_rate": 1.1328960994131862e-05, "loss": 0.4791, "step": 1578 }, { "epoch": 1.4715750232991613, "grad_norm": 0.16510557806302992, "learning_rate": 1.1322057300655851e-05, "loss": 0.4811, "step": 1579 }, { "epoch": 1.4725069897483691, "grad_norm": 0.1495155144993771, "learning_rate": 1.1315153607179843e-05, "loss": 0.4723, "step": 1580 }, { "epoch": 1.473438956197577, "grad_norm": 0.15884341370262575, "learning_rate": 1.1308249913703832e-05, "loss": 0.4713, "step": 1581 }, { "epoch": 1.4743709226467847, "grad_norm": 0.15079194589866823, "learning_rate": 1.1301346220227823e-05, "loss": 0.449, "step": 1582 }, { "epoch": 1.4753028890959925, "grad_norm": 0.1412051972858413, "learning_rate": 1.1294442526751812e-05, "loss": 0.4573, "step": 1583 }, { "epoch": 1.4762348555452003, "grad_norm": 0.16921093075918656, "learning_rate": 1.1287538833275805e-05, "loss": 0.4776, "step": 1584 }, { "epoch": 1.477166821994408, "grad_norm": 0.1544210677609436, "learning_rate": 1.1280635139799794e-05, "loss": 0.4844, "step": 1585 }, { "epoch": 1.478098788443616, "grad_norm": 0.1553167873224647, "learning_rate": 1.1273731446323783e-05, "loss": 0.4946, "step": 1586 }, { "epoch": 1.4790307548928239, "grad_norm": 0.16089972830958119, "learning_rate": 1.1266827752847775e-05, "loss": 0.4595, "step": 1587 }, { "epoch": 1.4799627213420317, "grad_norm": 0.15557696834108067, "learning_rate": 1.1259924059371766e-05, "loss": 0.4606, "step": 1588 }, { "epoch": 1.4808946877912395, "grad_norm": 0.15886004253643127, "learning_rate": 1.1253020365895755e-05, "loss": 0.5074, "step": 1589 }, { "epoch": 1.4818266542404475, "grad_norm": 0.1510025387741643, "learning_rate": 1.1246116672419744e-05, "loss": 0.4668, "step": 1590 }, { "epoch": 1.4827586206896552, "grad_norm": 0.17053997750827685, "learning_rate": 1.1239212978943736e-05, "loss": 0.4915, "step": 1591 }, { "epoch": 1.483690587138863, "grad_norm": 0.167953019216876, "learning_rate": 1.1232309285467727e-05, "loss": 0.4598, "step": 1592 }, { "epoch": 1.4846225535880708, "grad_norm": 0.1573917263585204, "learning_rate": 1.1225405591991716e-05, "loss": 0.4898, "step": 1593 }, { "epoch": 1.4855545200372786, "grad_norm": 0.15377976600358179, "learning_rate": 1.1218501898515709e-05, "loss": 0.4492, "step": 1594 }, { "epoch": 1.4864864864864864, "grad_norm": 0.1571646082174891, "learning_rate": 1.1211598205039698e-05, "loss": 0.4568, "step": 1595 }, { "epoch": 1.4874184529356942, "grad_norm": 0.15399716143215506, "learning_rate": 1.1204694511563687e-05, "loss": 0.4703, "step": 1596 }, { "epoch": 1.4883504193849022, "grad_norm": 0.14666590944835683, "learning_rate": 1.1197790818087677e-05, "loss": 0.4591, "step": 1597 }, { "epoch": 1.48928238583411, "grad_norm": 0.16129203336020773, "learning_rate": 1.119088712461167e-05, "loss": 0.4652, "step": 1598 }, { "epoch": 1.4902143522833178, "grad_norm": 0.1470348547011586, "learning_rate": 1.1183983431135659e-05, "loss": 0.4668, "step": 1599 }, { "epoch": 1.4911463187325256, "grad_norm": 0.16218908392001616, "learning_rate": 1.1177079737659648e-05, "loss": 0.4794, "step": 1600 }, { "epoch": 1.4920782851817336, "grad_norm": 0.15963980647064083, "learning_rate": 1.1170176044183639e-05, "loss": 0.4909, "step": 1601 }, { "epoch": 1.4930102516309414, "grad_norm": 0.16030068786926036, "learning_rate": 1.1163272350707631e-05, "loss": 0.4693, "step": 1602 }, { "epoch": 1.4939422180801492, "grad_norm": 0.15180197906018306, "learning_rate": 1.115636865723162e-05, "loss": 0.4625, "step": 1603 }, { "epoch": 1.494874184529357, "grad_norm": 0.1538955071247006, "learning_rate": 1.1149464963755609e-05, "loss": 0.489, "step": 1604 }, { "epoch": 1.4958061509785647, "grad_norm": 0.16298173877911634, "learning_rate": 1.1142561270279602e-05, "loss": 0.4734, "step": 1605 }, { "epoch": 1.4967381174277725, "grad_norm": 0.16422049519810858, "learning_rate": 1.113565757680359e-05, "loss": 0.458, "step": 1606 }, { "epoch": 1.4976700838769803, "grad_norm": 0.165970808261398, "learning_rate": 1.1128753883327581e-05, "loss": 0.48, "step": 1607 }, { "epoch": 1.4986020503261883, "grad_norm": 0.15261492706422758, "learning_rate": 1.112185018985157e-05, "loss": 0.5002, "step": 1608 }, { "epoch": 1.499534016775396, "grad_norm": 0.14970568044110757, "learning_rate": 1.1114946496375563e-05, "loss": 0.4468, "step": 1609 }, { "epoch": 1.500465983224604, "grad_norm": 0.16399184197621372, "learning_rate": 1.1108042802899552e-05, "loss": 0.4761, "step": 1610 }, { "epoch": 1.501397949673812, "grad_norm": 0.14181954027765872, "learning_rate": 1.1101139109423543e-05, "loss": 0.4374, "step": 1611 }, { "epoch": 1.5023299161230197, "grad_norm": 0.16341003894077427, "learning_rate": 1.1094235415947532e-05, "loss": 0.4938, "step": 1612 }, { "epoch": 1.5032618825722275, "grad_norm": 0.153806924271998, "learning_rate": 1.1087331722471524e-05, "loss": 0.4558, "step": 1613 }, { "epoch": 1.5041938490214353, "grad_norm": 0.15731309847022729, "learning_rate": 1.1080428028995513e-05, "loss": 0.4785, "step": 1614 }, { "epoch": 1.505125815470643, "grad_norm": 0.16483103061731877, "learning_rate": 1.1073524335519504e-05, "loss": 0.4603, "step": 1615 }, { "epoch": 1.5060577819198508, "grad_norm": 0.16132960876604968, "learning_rate": 1.1066620642043494e-05, "loss": 0.4621, "step": 1616 }, { "epoch": 1.5069897483690586, "grad_norm": 0.1576476161597931, "learning_rate": 1.1059716948567485e-05, "loss": 0.4492, "step": 1617 }, { "epoch": 1.5079217148182664, "grad_norm": 0.15946043315830294, "learning_rate": 1.1052813255091474e-05, "loss": 0.4844, "step": 1618 }, { "epoch": 1.5088536812674742, "grad_norm": 0.16245948461727766, "learning_rate": 1.1045909561615465e-05, "loss": 0.4647, "step": 1619 }, { "epoch": 1.5097856477166822, "grad_norm": 0.16017001077366136, "learning_rate": 1.1039005868139456e-05, "loss": 0.4682, "step": 1620 }, { "epoch": 1.51071761416589, "grad_norm": 0.14140171867724033, "learning_rate": 1.1032102174663446e-05, "loss": 0.4399, "step": 1621 }, { "epoch": 1.511649580615098, "grad_norm": 0.14920043079145306, "learning_rate": 1.1025198481187435e-05, "loss": 0.4527, "step": 1622 }, { "epoch": 1.5125815470643058, "grad_norm": 0.1431579902742567, "learning_rate": 1.1018294787711426e-05, "loss": 0.4561, "step": 1623 }, { "epoch": 1.5135135135135136, "grad_norm": 0.17083198760509785, "learning_rate": 1.1011391094235417e-05, "loss": 0.4694, "step": 1624 }, { "epoch": 1.5144454799627214, "grad_norm": 0.1447632407603151, "learning_rate": 1.1004487400759408e-05, "loss": 0.4389, "step": 1625 }, { "epoch": 1.5153774464119292, "grad_norm": 0.14935132486151886, "learning_rate": 1.0997583707283397e-05, "loss": 0.4425, "step": 1626 }, { "epoch": 1.516309412861137, "grad_norm": 0.1498691234054971, "learning_rate": 1.0990680013807389e-05, "loss": 0.4451, "step": 1627 }, { "epoch": 1.5172413793103448, "grad_norm": 0.15108277902658127, "learning_rate": 1.0983776320331378e-05, "loss": 0.4464, "step": 1628 }, { "epoch": 1.5181733457595525, "grad_norm": 0.15130309920473076, "learning_rate": 1.0976872626855369e-05, "loss": 0.458, "step": 1629 }, { "epoch": 1.5191053122087603, "grad_norm": 0.17974945993690497, "learning_rate": 1.0969968933379358e-05, "loss": 0.4776, "step": 1630 }, { "epoch": 1.5200372786579683, "grad_norm": 0.15372872272436394, "learning_rate": 1.096306523990335e-05, "loss": 0.4475, "step": 1631 }, { "epoch": 1.5209692451071761, "grad_norm": 0.14805887945258484, "learning_rate": 1.095616154642734e-05, "loss": 0.4441, "step": 1632 }, { "epoch": 1.521901211556384, "grad_norm": 0.16234512780994706, "learning_rate": 1.094925785295133e-05, "loss": 0.4918, "step": 1633 }, { "epoch": 1.522833178005592, "grad_norm": 0.1536134800070843, "learning_rate": 1.094235415947532e-05, "loss": 0.4641, "step": 1634 }, { "epoch": 1.5237651444547997, "grad_norm": 0.1453983561045919, "learning_rate": 1.0935450465999312e-05, "loss": 0.4321, "step": 1635 }, { "epoch": 1.5246971109040075, "grad_norm": 0.16374100510722964, "learning_rate": 1.09285467725233e-05, "loss": 0.4714, "step": 1636 }, { "epoch": 1.5256290773532153, "grad_norm": 0.15432688286128096, "learning_rate": 1.092164307904729e-05, "loss": 0.4407, "step": 1637 }, { "epoch": 1.526561043802423, "grad_norm": 0.14342659640300306, "learning_rate": 1.0914739385571282e-05, "loss": 0.4516, "step": 1638 }, { "epoch": 1.5274930102516309, "grad_norm": 0.1630040690171116, "learning_rate": 1.0907835692095273e-05, "loss": 0.4754, "step": 1639 }, { "epoch": 1.5284249767008387, "grad_norm": 0.15921027911423724, "learning_rate": 1.0900931998619262e-05, "loss": 0.4659, "step": 1640 }, { "epoch": 1.5293569431500464, "grad_norm": 0.15833653238533238, "learning_rate": 1.0894028305143251e-05, "loss": 0.4821, "step": 1641 }, { "epoch": 1.5302889095992545, "grad_norm": 0.15475368473927026, "learning_rate": 1.0887124611667243e-05, "loss": 0.4694, "step": 1642 }, { "epoch": 1.5312208760484622, "grad_norm": 0.15658416655343577, "learning_rate": 1.0880220918191234e-05, "loss": 0.5027, "step": 1643 }, { "epoch": 1.53215284249767, "grad_norm": 0.15237865042506335, "learning_rate": 1.0873317224715223e-05, "loss": 0.4679, "step": 1644 }, { "epoch": 1.533084808946878, "grad_norm": 0.15434602560939015, "learning_rate": 1.0866413531239216e-05, "loss": 0.4665, "step": 1645 }, { "epoch": 1.5340167753960858, "grad_norm": 0.1559857320679807, "learning_rate": 1.0859509837763205e-05, "loss": 0.4893, "step": 1646 }, { "epoch": 1.5349487418452936, "grad_norm": 0.1469932434160377, "learning_rate": 1.0852606144287194e-05, "loss": 0.4572, "step": 1647 }, { "epoch": 1.5358807082945014, "grad_norm": 0.17108855158388112, "learning_rate": 1.0845702450811184e-05, "loss": 0.4682, "step": 1648 }, { "epoch": 1.5368126747437092, "grad_norm": 0.21589488438392052, "learning_rate": 1.0838798757335177e-05, "loss": 0.4757, "step": 1649 }, { "epoch": 1.537744641192917, "grad_norm": 0.1673663689651782, "learning_rate": 1.0831895063859166e-05, "loss": 0.4657, "step": 1650 }, { "epoch": 1.5386766076421248, "grad_norm": 0.1482293169556248, "learning_rate": 1.0824991370383155e-05, "loss": 0.482, "step": 1651 }, { "epoch": 1.5396085740913326, "grad_norm": 0.15857359832885437, "learning_rate": 1.0818087676907146e-05, "loss": 0.4778, "step": 1652 }, { "epoch": 1.5405405405405406, "grad_norm": 0.1479551873216002, "learning_rate": 1.0811183983431136e-05, "loss": 0.49, "step": 1653 }, { "epoch": 1.5414725069897484, "grad_norm": 0.14568007955209758, "learning_rate": 1.0804280289955127e-05, "loss": 0.4511, "step": 1654 }, { "epoch": 1.5424044734389561, "grad_norm": 0.14738396654197175, "learning_rate": 1.0797376596479116e-05, "loss": 0.4696, "step": 1655 }, { "epoch": 1.5433364398881642, "grad_norm": 0.15228966969763358, "learning_rate": 1.0790472903003109e-05, "loss": 0.4625, "step": 1656 }, { "epoch": 1.544268406337372, "grad_norm": 0.13977071517843218, "learning_rate": 1.0783569209527098e-05, "loss": 0.4485, "step": 1657 }, { "epoch": 1.5452003727865797, "grad_norm": 0.15405823963771845, "learning_rate": 1.0776665516051088e-05, "loss": 0.4703, "step": 1658 }, { "epoch": 1.5461323392357875, "grad_norm": 0.15866670642222383, "learning_rate": 1.0769761822575077e-05, "loss": 0.4699, "step": 1659 }, { "epoch": 1.5470643056849953, "grad_norm": 0.1574575494847432, "learning_rate": 1.076285812909907e-05, "loss": 0.4864, "step": 1660 }, { "epoch": 1.547996272134203, "grad_norm": 0.14475438830301932, "learning_rate": 1.0755954435623059e-05, "loss": 0.4593, "step": 1661 }, { "epoch": 1.5489282385834109, "grad_norm": 0.16087731681390816, "learning_rate": 1.074905074214705e-05, "loss": 0.4534, "step": 1662 }, { "epoch": 1.5498602050326187, "grad_norm": 0.16451860680921707, "learning_rate": 1.0742147048671039e-05, "loss": 0.4534, "step": 1663 }, { "epoch": 1.5507921714818267, "grad_norm": 0.15437973651167897, "learning_rate": 1.0735243355195031e-05, "loss": 0.4678, "step": 1664 }, { "epoch": 1.5517241379310345, "grad_norm": 0.1458137101787921, "learning_rate": 1.072833966171902e-05, "loss": 0.4456, "step": 1665 }, { "epoch": 1.5526561043802423, "grad_norm": 0.15604221018854755, "learning_rate": 1.072143596824301e-05, "loss": 0.4574, "step": 1666 }, { "epoch": 1.5535880708294503, "grad_norm": 0.14677686997931366, "learning_rate": 1.0714532274767002e-05, "loss": 0.473, "step": 1667 }, { "epoch": 1.554520037278658, "grad_norm": 0.14369862336895708, "learning_rate": 1.0707628581290992e-05, "loss": 0.4584, "step": 1668 }, { "epoch": 1.5554520037278659, "grad_norm": 0.15108251193283415, "learning_rate": 1.0700724887814981e-05, "loss": 0.4614, "step": 1669 }, { "epoch": 1.5563839701770736, "grad_norm": 0.14726401352668092, "learning_rate": 1.0693821194338972e-05, "loss": 0.4516, "step": 1670 }, { "epoch": 1.5573159366262814, "grad_norm": 0.1382271312020064, "learning_rate": 1.0686917500862963e-05, "loss": 0.4357, "step": 1671 }, { "epoch": 1.5582479030754892, "grad_norm": 0.15946425621571209, "learning_rate": 1.0680013807386953e-05, "loss": 0.4988, "step": 1672 }, { "epoch": 1.559179869524697, "grad_norm": 0.14729876705000802, "learning_rate": 1.0673110113910943e-05, "loss": 0.4874, "step": 1673 }, { "epoch": 1.5601118359739048, "grad_norm": 0.14090028411498692, "learning_rate": 1.0666206420434933e-05, "loss": 0.4503, "step": 1674 }, { "epoch": 1.5610438024231128, "grad_norm": 0.15318832630684504, "learning_rate": 1.0659302726958924e-05, "loss": 0.4909, "step": 1675 }, { "epoch": 1.5619757688723206, "grad_norm": 0.14740978095935714, "learning_rate": 1.0652399033482915e-05, "loss": 0.4588, "step": 1676 }, { "epoch": 1.5629077353215284, "grad_norm": 0.14268103055416023, "learning_rate": 1.0645495340006904e-05, "loss": 0.4496, "step": 1677 }, { "epoch": 1.5638397017707364, "grad_norm": 0.14698639453367546, "learning_rate": 1.0638591646530896e-05, "loss": 0.4587, "step": 1678 }, { "epoch": 1.5647716682199442, "grad_norm": 0.14466218115110585, "learning_rate": 1.0631687953054885e-05, "loss": 0.4435, "step": 1679 }, { "epoch": 1.565703634669152, "grad_norm": 0.15799544521290074, "learning_rate": 1.0624784259578876e-05, "loss": 0.488, "step": 1680 }, { "epoch": 1.5666356011183598, "grad_norm": 0.15043155315851323, "learning_rate": 1.0617880566102865e-05, "loss": 0.4523, "step": 1681 }, { "epoch": 1.5675675675675675, "grad_norm": 0.32319433520920304, "learning_rate": 1.0610976872626857e-05, "loss": 0.4408, "step": 1682 }, { "epoch": 1.5684995340167753, "grad_norm": 0.14031562999432376, "learning_rate": 1.0604073179150846e-05, "loss": 0.445, "step": 1683 }, { "epoch": 1.5694315004659831, "grad_norm": 0.16760744686485743, "learning_rate": 1.0597169485674837e-05, "loss": 0.473, "step": 1684 }, { "epoch": 1.570363466915191, "grad_norm": 0.15817309935718007, "learning_rate": 1.0590265792198828e-05, "loss": 0.4682, "step": 1685 }, { "epoch": 1.571295433364399, "grad_norm": 0.1689822716769008, "learning_rate": 1.0583362098722819e-05, "loss": 0.4595, "step": 1686 }, { "epoch": 1.5722273998136067, "grad_norm": 0.14405717643504087, "learning_rate": 1.0576458405246808e-05, "loss": 0.4613, "step": 1687 }, { "epoch": 1.5731593662628145, "grad_norm": 0.17088905953779204, "learning_rate": 1.0569554711770797e-05, "loss": 0.4853, "step": 1688 }, { "epoch": 1.5740913327120225, "grad_norm": 0.1468940841440026, "learning_rate": 1.056265101829479e-05, "loss": 0.4784, "step": 1689 }, { "epoch": 1.5750232991612303, "grad_norm": 0.15271870531913867, "learning_rate": 1.055574732481878e-05, "loss": 0.474, "step": 1690 }, { "epoch": 1.575955265610438, "grad_norm": 0.15104851935854377, "learning_rate": 1.0548843631342769e-05, "loss": 0.489, "step": 1691 }, { "epoch": 1.5768872320596459, "grad_norm": 0.14562129469111826, "learning_rate": 1.0541939937866758e-05, "loss": 0.4654, "step": 1692 }, { "epoch": 1.5778191985088537, "grad_norm": 0.1561208183787563, "learning_rate": 1.053503624439075e-05, "loss": 0.5008, "step": 1693 }, { "epoch": 1.5787511649580614, "grad_norm": 0.14566400143156763, "learning_rate": 1.0528132550914741e-05, "loss": 0.4474, "step": 1694 }, { "epoch": 1.5796831314072692, "grad_norm": 0.1559726266528745, "learning_rate": 1.052122885743873e-05, "loss": 0.4698, "step": 1695 }, { "epoch": 1.580615097856477, "grad_norm": 0.139986925879674, "learning_rate": 1.0514325163962723e-05, "loss": 0.4563, "step": 1696 }, { "epoch": 1.581547064305685, "grad_norm": 0.14332041007111584, "learning_rate": 1.0507421470486712e-05, "loss": 0.4697, "step": 1697 }, { "epoch": 1.5824790307548928, "grad_norm": 0.15552798359576778, "learning_rate": 1.05005177770107e-05, "loss": 0.478, "step": 1698 }, { "epoch": 1.5834109972041006, "grad_norm": 0.16608649386275517, "learning_rate": 1.0493614083534691e-05, "loss": 0.4401, "step": 1699 }, { "epoch": 1.5843429636533086, "grad_norm": 0.18108712654760709, "learning_rate": 1.0486710390058684e-05, "loss": 0.4854, "step": 1700 }, { "epoch": 1.5852749301025164, "grad_norm": 0.14942084935890504, "learning_rate": 1.0479806696582673e-05, "loss": 0.4556, "step": 1701 }, { "epoch": 1.5862068965517242, "grad_norm": 0.16660860315098497, "learning_rate": 1.0472903003106662e-05, "loss": 0.4689, "step": 1702 }, { "epoch": 1.587138863000932, "grad_norm": 0.1507708594801108, "learning_rate": 1.0465999309630653e-05, "loss": 0.4459, "step": 1703 }, { "epoch": 1.5880708294501398, "grad_norm": 0.1629399903845452, "learning_rate": 1.0459095616154643e-05, "loss": 0.5018, "step": 1704 }, { "epoch": 1.5890027958993476, "grad_norm": 0.1502445105308631, "learning_rate": 1.0452191922678634e-05, "loss": 0.4629, "step": 1705 }, { "epoch": 1.5899347623485554, "grad_norm": 0.1466765388433369, "learning_rate": 1.0445288229202623e-05, "loss": 0.4468, "step": 1706 }, { "epoch": 1.5908667287977631, "grad_norm": 0.1499142145493615, "learning_rate": 1.0438384535726616e-05, "loss": 0.4539, "step": 1707 }, { "epoch": 1.5917986952469712, "grad_norm": 0.1531315611196122, "learning_rate": 1.0431480842250605e-05, "loss": 0.4661, "step": 1708 }, { "epoch": 1.592730661696179, "grad_norm": 0.17898237492351796, "learning_rate": 1.0424577148774595e-05, "loss": 0.4809, "step": 1709 }, { "epoch": 1.5936626281453867, "grad_norm": 0.1537346501872952, "learning_rate": 1.0417673455298584e-05, "loss": 0.4781, "step": 1710 }, { "epoch": 1.5945945945945947, "grad_norm": 0.15288151003394007, "learning_rate": 1.0410769761822577e-05, "loss": 0.4566, "step": 1711 }, { "epoch": 1.5955265610438025, "grad_norm": 0.1488203371681716, "learning_rate": 1.0403866068346566e-05, "loss": 0.4703, "step": 1712 }, { "epoch": 1.5964585274930103, "grad_norm": 0.15110015666315224, "learning_rate": 1.0396962374870557e-05, "loss": 0.4742, "step": 1713 }, { "epoch": 1.597390493942218, "grad_norm": 0.15994903783005399, "learning_rate": 1.0390058681394546e-05, "loss": 0.4457, "step": 1714 }, { "epoch": 1.598322460391426, "grad_norm": 0.1551479257803198, "learning_rate": 1.0383154987918538e-05, "loss": 0.4551, "step": 1715 }, { "epoch": 1.5992544268406337, "grad_norm": 0.1543535175152498, "learning_rate": 1.0376251294442527e-05, "loss": 0.4687, "step": 1716 }, { "epoch": 1.6001863932898415, "grad_norm": 0.14669281435772505, "learning_rate": 1.0369347600966518e-05, "loss": 0.4688, "step": 1717 }, { "epoch": 1.6011183597390493, "grad_norm": 0.14743162090434467, "learning_rate": 1.0362443907490509e-05, "loss": 0.4482, "step": 1718 }, { "epoch": 1.6020503261882573, "grad_norm": 0.15354281661097532, "learning_rate": 1.03555402140145e-05, "loss": 0.4662, "step": 1719 }, { "epoch": 1.602982292637465, "grad_norm": 0.1558841128246635, "learning_rate": 1.0348636520538488e-05, "loss": 0.4884, "step": 1720 }, { "epoch": 1.6039142590866728, "grad_norm": 0.15747600963403438, "learning_rate": 1.0341732827062479e-05, "loss": 0.4877, "step": 1721 }, { "epoch": 1.6048462255358809, "grad_norm": 0.1552908762328549, "learning_rate": 1.033482913358647e-05, "loss": 0.4606, "step": 1722 }, { "epoch": 1.6057781919850886, "grad_norm": 0.15219621960197205, "learning_rate": 1.032792544011046e-05, "loss": 0.4647, "step": 1723 }, { "epoch": 1.6067101584342964, "grad_norm": 0.15098872065197538, "learning_rate": 1.032102174663445e-05, "loss": 0.4753, "step": 1724 }, { "epoch": 1.6076421248835042, "grad_norm": 0.15574444055247696, "learning_rate": 1.031411805315844e-05, "loss": 0.4377, "step": 1725 }, { "epoch": 1.608574091332712, "grad_norm": 0.149027860233739, "learning_rate": 1.0307214359682431e-05, "loss": 0.4417, "step": 1726 }, { "epoch": 1.6095060577819198, "grad_norm": 0.14496012940168543, "learning_rate": 1.0300310666206422e-05, "loss": 0.461, "step": 1727 }, { "epoch": 1.6104380242311276, "grad_norm": 0.16212322378210048, "learning_rate": 1.029340697273041e-05, "loss": 0.465, "step": 1728 }, { "epoch": 1.6113699906803354, "grad_norm": 0.15880354333815533, "learning_rate": 1.0286503279254403e-05, "loss": 0.4673, "step": 1729 }, { "epoch": 1.6123019571295434, "grad_norm": 0.1506544400340835, "learning_rate": 1.0279599585778392e-05, "loss": 0.4506, "step": 1730 }, { "epoch": 1.6132339235787512, "grad_norm": 0.151062124018934, "learning_rate": 1.0272695892302383e-05, "loss": 0.4675, "step": 1731 }, { "epoch": 1.614165890027959, "grad_norm": 0.14960409201988137, "learning_rate": 1.0265792198826372e-05, "loss": 0.49, "step": 1732 }, { "epoch": 1.615097856477167, "grad_norm": 0.1437518649807138, "learning_rate": 1.0258888505350364e-05, "loss": 0.4554, "step": 1733 }, { "epoch": 1.6160298229263748, "grad_norm": 0.14627670112626365, "learning_rate": 1.0251984811874353e-05, "loss": 0.4453, "step": 1734 }, { "epoch": 1.6169617893755825, "grad_norm": 0.15044094907823025, "learning_rate": 1.0245081118398344e-05, "loss": 0.4748, "step": 1735 }, { "epoch": 1.6178937558247903, "grad_norm": 0.1552089057752464, "learning_rate": 1.0238177424922335e-05, "loss": 0.4444, "step": 1736 }, { "epoch": 1.6188257222739981, "grad_norm": 0.15056567273565152, "learning_rate": 1.0231273731446326e-05, "loss": 0.4608, "step": 1737 }, { "epoch": 1.619757688723206, "grad_norm": 0.1499356078581539, "learning_rate": 1.0224370037970315e-05, "loss": 0.4562, "step": 1738 }, { "epoch": 1.6206896551724137, "grad_norm": 0.16520606413403824, "learning_rate": 1.0217466344494304e-05, "loss": 0.4569, "step": 1739 }, { "epoch": 1.6216216216216215, "grad_norm": 0.16207458666272143, "learning_rate": 1.0210562651018296e-05, "loss": 0.4985, "step": 1740 }, { "epoch": 1.6225535880708295, "grad_norm": 0.1592326474011908, "learning_rate": 1.0203658957542287e-05, "loss": 0.4788, "step": 1741 }, { "epoch": 1.6234855545200373, "grad_norm": 0.15943930156444397, "learning_rate": 1.0196755264066276e-05, "loss": 0.4863, "step": 1742 }, { "epoch": 1.624417520969245, "grad_norm": 0.14926017675179756, "learning_rate": 1.0189851570590265e-05, "loss": 0.4613, "step": 1743 }, { "epoch": 1.625349487418453, "grad_norm": 0.16511491755746524, "learning_rate": 1.0182947877114257e-05, "loss": 0.4836, "step": 1744 }, { "epoch": 1.6262814538676609, "grad_norm": 0.1554425017742698, "learning_rate": 1.0176044183638248e-05, "loss": 0.4561, "step": 1745 }, { "epoch": 1.6272134203168687, "grad_norm": 0.1532183268613238, "learning_rate": 1.0169140490162237e-05, "loss": 0.4832, "step": 1746 }, { "epoch": 1.6281453867660765, "grad_norm": 0.1413267184439827, "learning_rate": 1.016223679668623e-05, "loss": 0.4477, "step": 1747 }, { "epoch": 1.6290773532152842, "grad_norm": 0.14886841989020852, "learning_rate": 1.0155333103210219e-05, "loss": 0.4846, "step": 1748 }, { "epoch": 1.630009319664492, "grad_norm": 0.1355850485847852, "learning_rate": 1.0148429409734208e-05, "loss": 0.4443, "step": 1749 }, { "epoch": 1.6309412861136998, "grad_norm": 0.16783996275341384, "learning_rate": 1.0141525716258198e-05, "loss": 0.4797, "step": 1750 }, { "epoch": 1.6318732525629076, "grad_norm": 0.15110941621328874, "learning_rate": 1.0134622022782191e-05, "loss": 0.4543, "step": 1751 }, { "epoch": 1.6328052190121156, "grad_norm": 0.14362793816920877, "learning_rate": 1.012771832930618e-05, "loss": 0.4643, "step": 1752 }, { "epoch": 1.6337371854613234, "grad_norm": 0.15329905913530759, "learning_rate": 1.0120814635830169e-05, "loss": 0.4726, "step": 1753 }, { "epoch": 1.6346691519105312, "grad_norm": 0.14412726662132713, "learning_rate": 1.011391094235416e-05, "loss": 0.4463, "step": 1754 }, { "epoch": 1.6356011183597392, "grad_norm": 0.16316887569099076, "learning_rate": 1.010700724887815e-05, "loss": 0.4666, "step": 1755 }, { "epoch": 1.636533084808947, "grad_norm": 0.20557175547701464, "learning_rate": 1.0100103555402141e-05, "loss": 0.4739, "step": 1756 }, { "epoch": 1.6374650512581548, "grad_norm": 0.14764296664882748, "learning_rate": 1.009319986192613e-05, "loss": 0.4681, "step": 1757 }, { "epoch": 1.6383970177073626, "grad_norm": 0.14652830310124274, "learning_rate": 1.0086296168450123e-05, "loss": 0.4604, "step": 1758 }, { "epoch": 1.6393289841565704, "grad_norm": 0.15046359993841127, "learning_rate": 1.0079392474974112e-05, "loss": 0.4652, "step": 1759 }, { "epoch": 1.6402609506057781, "grad_norm": 0.17029765093396967, "learning_rate": 1.0072488781498102e-05, "loss": 0.4792, "step": 1760 }, { "epoch": 1.641192917054986, "grad_norm": 0.1903444941173408, "learning_rate": 1.0065585088022091e-05, "loss": 0.4754, "step": 1761 }, { "epoch": 1.6421248835041937, "grad_norm": 0.1369689195386739, "learning_rate": 1.0058681394546084e-05, "loss": 0.4438, "step": 1762 }, { "epoch": 1.6430568499534017, "grad_norm": 0.15381609400737245, "learning_rate": 1.0051777701070073e-05, "loss": 0.4765, "step": 1763 }, { "epoch": 1.6439888164026095, "grad_norm": 0.15226305448213065, "learning_rate": 1.0044874007594064e-05, "loss": 0.4862, "step": 1764 }, { "epoch": 1.6449207828518173, "grad_norm": 0.14793220986209502, "learning_rate": 1.0037970314118053e-05, "loss": 0.4617, "step": 1765 }, { "epoch": 1.6458527493010253, "grad_norm": 0.1451457182229337, "learning_rate": 1.0031066620642045e-05, "loss": 0.4573, "step": 1766 }, { "epoch": 1.646784715750233, "grad_norm": 0.13762064527412585, "learning_rate": 1.0024162927166034e-05, "loss": 0.4568, "step": 1767 }, { "epoch": 1.647716682199441, "grad_norm": 0.14896428975690393, "learning_rate": 1.0017259233690025e-05, "loss": 0.4416, "step": 1768 }, { "epoch": 1.6486486486486487, "grad_norm": 0.14757621355312453, "learning_rate": 1.0010355540214016e-05, "loss": 0.4702, "step": 1769 }, { "epoch": 1.6495806150978565, "grad_norm": 0.15537116407955404, "learning_rate": 1.0003451846738006e-05, "loss": 0.479, "step": 1770 }, { "epoch": 1.6505125815470643, "grad_norm": 0.14096783899809606, "learning_rate": 9.996548153261995e-06, "loss": 0.4384, "step": 1771 }, { "epoch": 1.651444547996272, "grad_norm": 0.14417622761917231, "learning_rate": 9.989644459785986e-06, "loss": 0.4443, "step": 1772 }, { "epoch": 1.6523765144454798, "grad_norm": 0.15291903111936592, "learning_rate": 9.982740766309977e-06, "loss": 0.4863, "step": 1773 }, { "epoch": 1.6533084808946876, "grad_norm": 0.13817506185955397, "learning_rate": 9.975837072833968e-06, "loss": 0.4445, "step": 1774 }, { "epoch": 1.6542404473438956, "grad_norm": 0.14942472370173668, "learning_rate": 9.968933379357957e-06, "loss": 0.4386, "step": 1775 }, { "epoch": 1.6551724137931034, "grad_norm": 0.15497702647971925, "learning_rate": 9.962029685881947e-06, "loss": 0.4765, "step": 1776 }, { "epoch": 1.6561043802423114, "grad_norm": 0.158412563939003, "learning_rate": 9.955125992405938e-06, "loss": 0.4703, "step": 1777 }, { "epoch": 1.6570363466915192, "grad_norm": 0.15033567815296223, "learning_rate": 9.948222298929929e-06, "loss": 0.4664, "step": 1778 }, { "epoch": 1.657968313140727, "grad_norm": 0.15723571042579432, "learning_rate": 9.94131860545392e-06, "loss": 0.4988, "step": 1779 }, { "epoch": 1.6589002795899348, "grad_norm": 0.14887904029173354, "learning_rate": 9.934414911977909e-06, "loss": 0.4429, "step": 1780 }, { "epoch": 1.6598322460391426, "grad_norm": 0.14635853208301408, "learning_rate": 9.9275112185019e-06, "loss": 0.4739, "step": 1781 }, { "epoch": 1.6607642124883504, "grad_norm": 0.1430290632497785, "learning_rate": 9.92060752502589e-06, "loss": 0.4784, "step": 1782 }, { "epoch": 1.6616961789375582, "grad_norm": 0.14308341386625792, "learning_rate": 9.91370383154988e-06, "loss": 0.4528, "step": 1783 }, { "epoch": 1.662628145386766, "grad_norm": 0.14956294055356809, "learning_rate": 9.90680013807387e-06, "loss": 0.4502, "step": 1784 }, { "epoch": 1.6635601118359737, "grad_norm": 0.1594481789627023, "learning_rate": 9.89989644459786e-06, "loss": 0.4676, "step": 1785 }, { "epoch": 1.6644920782851818, "grad_norm": 0.15582306120168926, "learning_rate": 9.892992751121851e-06, "loss": 0.4642, "step": 1786 }, { "epoch": 1.6654240447343895, "grad_norm": 0.1525599251799859, "learning_rate": 9.886089057645842e-06, "loss": 0.4494, "step": 1787 }, { "epoch": 1.6663560111835976, "grad_norm": 0.16307137544755682, "learning_rate": 9.879185364169833e-06, "loss": 0.4699, "step": 1788 }, { "epoch": 1.6672879776328053, "grad_norm": 0.15579749435801818, "learning_rate": 9.872281670693822e-06, "loss": 0.4754, "step": 1789 }, { "epoch": 1.6682199440820131, "grad_norm": 0.14646441679116848, "learning_rate": 9.865377977217812e-06, "loss": 0.4641, "step": 1790 }, { "epoch": 1.669151910531221, "grad_norm": 0.14024291333334563, "learning_rate": 9.858474283741803e-06, "loss": 0.4538, "step": 1791 }, { "epoch": 1.6700838769804287, "grad_norm": 0.15374569129104598, "learning_rate": 9.851570590265794e-06, "loss": 0.4593, "step": 1792 }, { "epoch": 1.6710158434296365, "grad_norm": 0.1571697869006609, "learning_rate": 9.844666896789783e-06, "loss": 0.4744, "step": 1793 }, { "epoch": 1.6719478098788443, "grad_norm": 0.13793289928497712, "learning_rate": 9.837763203313774e-06, "loss": 0.4312, "step": 1794 }, { "epoch": 1.672879776328052, "grad_norm": 0.1588493869921497, "learning_rate": 9.830859509837763e-06, "loss": 0.4536, "step": 1795 }, { "epoch": 1.6738117427772599, "grad_norm": 0.15698473658058437, "learning_rate": 9.823955816361753e-06, "loss": 0.4727, "step": 1796 }, { "epoch": 1.6747437092264679, "grad_norm": 0.13722106887198554, "learning_rate": 9.817052122885746e-06, "loss": 0.4315, "step": 1797 }, { "epoch": 1.6756756756756757, "grad_norm": 0.1710468902634982, "learning_rate": 9.810148429409735e-06, "loss": 0.4937, "step": 1798 }, { "epoch": 1.6766076421248837, "grad_norm": 0.14203376054716257, "learning_rate": 9.803244735933726e-06, "loss": 0.4138, "step": 1799 }, { "epoch": 1.6775396085740915, "grad_norm": 0.1545792932994176, "learning_rate": 9.796341042457715e-06, "loss": 0.468, "step": 1800 }, { "epoch": 1.6784715750232992, "grad_norm": 0.15994193332247045, "learning_rate": 9.789437348981705e-06, "loss": 0.4729, "step": 1801 }, { "epoch": 1.679403541472507, "grad_norm": 0.15142009272211274, "learning_rate": 9.782533655505696e-06, "loss": 0.452, "step": 1802 }, { "epoch": 1.6803355079217148, "grad_norm": 0.18407951125030167, "learning_rate": 9.775629962029687e-06, "loss": 0.4941, "step": 1803 }, { "epoch": 1.6812674743709226, "grad_norm": 0.15786059485194542, "learning_rate": 9.768726268553676e-06, "loss": 0.433, "step": 1804 }, { "epoch": 1.6821994408201304, "grad_norm": 0.15339003748652738, "learning_rate": 9.761822575077667e-06, "loss": 0.4372, "step": 1805 }, { "epoch": 1.6831314072693382, "grad_norm": 0.15464091525480592, "learning_rate": 9.754918881601657e-06, "loss": 0.4432, "step": 1806 }, { "epoch": 1.684063373718546, "grad_norm": 0.16387834939662554, "learning_rate": 9.748015188125648e-06, "loss": 0.4609, "step": 1807 }, { "epoch": 1.684995340167754, "grad_norm": 0.16617514950742515, "learning_rate": 9.741111494649639e-06, "loss": 0.4901, "step": 1808 }, { "epoch": 1.6859273066169618, "grad_norm": 0.14963114289187934, "learning_rate": 9.734207801173628e-06, "loss": 0.453, "step": 1809 }, { "epoch": 1.6868592730661698, "grad_norm": 0.1539023605024061, "learning_rate": 9.727304107697619e-06, "loss": 0.4564, "step": 1810 }, { "epoch": 1.6877912395153776, "grad_norm": 0.17879081674146186, "learning_rate": 9.72040041422161e-06, "loss": 0.4769, "step": 1811 }, { "epoch": 1.6887232059645854, "grad_norm": 0.14442352728836177, "learning_rate": 9.7134967207456e-06, "loss": 0.4229, "step": 1812 }, { "epoch": 1.6896551724137931, "grad_norm": 0.1526829926621354, "learning_rate": 9.70659302726959e-06, "loss": 0.4511, "step": 1813 }, { "epoch": 1.690587138863001, "grad_norm": 0.16273737659227966, "learning_rate": 9.69968933379358e-06, "loss": 0.4458, "step": 1814 }, { "epoch": 1.6915191053122087, "grad_norm": 0.1522380147645675, "learning_rate": 9.69278564031757e-06, "loss": 0.4464, "step": 1815 }, { "epoch": 1.6924510717614165, "grad_norm": 0.16365479022215568, "learning_rate": 9.685881946841561e-06, "loss": 0.461, "step": 1816 }, { "epoch": 1.6933830382106243, "grad_norm": 0.15476877196014108, "learning_rate": 9.678978253365552e-06, "loss": 0.4647, "step": 1817 }, { "epoch": 1.694315004659832, "grad_norm": 0.16321453514858283, "learning_rate": 9.672074559889541e-06, "loss": 0.4752, "step": 1818 }, { "epoch": 1.69524697110904, "grad_norm": 0.1558518419216084, "learning_rate": 9.665170866413532e-06, "loss": 0.4408, "step": 1819 }, { "epoch": 1.696178937558248, "grad_norm": 0.14290331832696476, "learning_rate": 9.658267172937523e-06, "loss": 0.437, "step": 1820 }, { "epoch": 1.6971109040074557, "grad_norm": 0.15384698438237357, "learning_rate": 9.651363479461513e-06, "loss": 0.4623, "step": 1821 }, { "epoch": 1.6980428704566637, "grad_norm": 0.15138868232748293, "learning_rate": 9.644459785985502e-06, "loss": 0.4542, "step": 1822 }, { "epoch": 1.6989748369058715, "grad_norm": 0.17132718658168639, "learning_rate": 9.637556092509493e-06, "loss": 0.4948, "step": 1823 }, { "epoch": 1.6999068033550793, "grad_norm": 0.16247943026700865, "learning_rate": 9.630652399033484e-06, "loss": 0.4681, "step": 1824 }, { "epoch": 1.700838769804287, "grad_norm": 0.1637239725872859, "learning_rate": 9.623748705557475e-06, "loss": 0.4657, "step": 1825 }, { "epoch": 1.7017707362534948, "grad_norm": 0.144997989508278, "learning_rate": 9.616845012081464e-06, "loss": 0.4566, "step": 1826 }, { "epoch": 1.7027027027027026, "grad_norm": 0.16102490576645162, "learning_rate": 9.609941318605454e-06, "loss": 0.5021, "step": 1827 }, { "epoch": 1.7036346691519104, "grad_norm": 0.1458870087031052, "learning_rate": 9.603037625129445e-06, "loss": 0.4447, "step": 1828 }, { "epoch": 1.7045666356011182, "grad_norm": 0.15690534284496827, "learning_rate": 9.596133931653436e-06, "loss": 0.4657, "step": 1829 }, { "epoch": 1.7054986020503262, "grad_norm": 0.14988342112487488, "learning_rate": 9.589230238177427e-06, "loss": 0.4643, "step": 1830 }, { "epoch": 1.706430568499534, "grad_norm": 0.15293800699395008, "learning_rate": 9.582326544701416e-06, "loss": 0.4509, "step": 1831 }, { "epoch": 1.7073625349487418, "grad_norm": 0.15244644787498166, "learning_rate": 9.575422851225406e-06, "loss": 0.4669, "step": 1832 }, { "epoch": 1.7082945013979498, "grad_norm": 0.16706663852807885, "learning_rate": 9.568519157749397e-06, "loss": 0.4801, "step": 1833 }, { "epoch": 1.7092264678471576, "grad_norm": 0.1441588418489192, "learning_rate": 9.561615464273388e-06, "loss": 0.4672, "step": 1834 }, { "epoch": 1.7101584342963654, "grad_norm": 0.16868168815896958, "learning_rate": 9.554711770797377e-06, "loss": 0.4741, "step": 1835 }, { "epoch": 1.7110904007455732, "grad_norm": 0.1490256164198227, "learning_rate": 9.547808077321368e-06, "loss": 0.4634, "step": 1836 }, { "epoch": 1.712022367194781, "grad_norm": 0.15957242818284645, "learning_rate": 9.540904383845358e-06, "loss": 0.486, "step": 1837 }, { "epoch": 1.7129543336439887, "grad_norm": 0.14770962437141316, "learning_rate": 9.534000690369349e-06, "loss": 0.4187, "step": 1838 }, { "epoch": 1.7138863000931965, "grad_norm": 0.14527329710652742, "learning_rate": 9.52709699689334e-06, "loss": 0.4562, "step": 1839 }, { "epoch": 1.7148182665424043, "grad_norm": 0.1566713627198633, "learning_rate": 9.520193303417329e-06, "loss": 0.4757, "step": 1840 }, { "epoch": 1.7157502329916123, "grad_norm": 0.1460923847045655, "learning_rate": 9.51328960994132e-06, "loss": 0.4733, "step": 1841 }, { "epoch": 1.7166821994408201, "grad_norm": 0.20129904905034832, "learning_rate": 9.50638591646531e-06, "loss": 0.4622, "step": 1842 }, { "epoch": 1.717614165890028, "grad_norm": 0.14537337509584952, "learning_rate": 9.499482222989301e-06, "loss": 0.4694, "step": 1843 }, { "epoch": 1.718546132339236, "grad_norm": 0.15412175878647275, "learning_rate": 9.49257852951329e-06, "loss": 0.4563, "step": 1844 }, { "epoch": 1.7194780987884437, "grad_norm": 0.1380587890759501, "learning_rate": 9.48567483603728e-06, "loss": 0.4307, "step": 1845 }, { "epoch": 1.7204100652376515, "grad_norm": 0.14519909974561163, "learning_rate": 9.47877114256127e-06, "loss": 0.4529, "step": 1846 }, { "epoch": 1.7213420316868593, "grad_norm": 0.15383663344102996, "learning_rate": 9.47186744908526e-06, "loss": 0.4529, "step": 1847 }, { "epoch": 1.722273998136067, "grad_norm": 0.1345306772515371, "learning_rate": 9.464963755609253e-06, "loss": 0.4047, "step": 1848 }, { "epoch": 1.7232059645852749, "grad_norm": 0.1434026074184597, "learning_rate": 9.458060062133242e-06, "loss": 0.457, "step": 1849 }, { "epoch": 1.7241379310344827, "grad_norm": 0.14747517593449244, "learning_rate": 9.451156368657233e-06, "loss": 0.4582, "step": 1850 }, { "epoch": 1.7250698974836904, "grad_norm": 0.14779044521599705, "learning_rate": 9.444252675181222e-06, "loss": 0.4463, "step": 1851 }, { "epoch": 1.7260018639328985, "grad_norm": 0.18102161448234677, "learning_rate": 9.437348981705212e-06, "loss": 0.4735, "step": 1852 }, { "epoch": 1.7269338303821062, "grad_norm": 0.1463753198142361, "learning_rate": 9.430445288229203e-06, "loss": 0.4516, "step": 1853 }, { "epoch": 1.727865796831314, "grad_norm": 0.14538239222942584, "learning_rate": 9.423541594753194e-06, "loss": 0.4434, "step": 1854 }, { "epoch": 1.728797763280522, "grad_norm": 0.1495205601594995, "learning_rate": 9.416637901277183e-06, "loss": 0.4572, "step": 1855 }, { "epoch": 1.7297297297297298, "grad_norm": 0.15950789598820764, "learning_rate": 9.409734207801174e-06, "loss": 0.4644, "step": 1856 }, { "epoch": 1.7306616961789376, "grad_norm": 0.14935117128015035, "learning_rate": 9.402830514325164e-06, "loss": 0.4874, "step": 1857 }, { "epoch": 1.7315936626281454, "grad_norm": 0.14051840963727316, "learning_rate": 9.395926820849155e-06, "loss": 0.4461, "step": 1858 }, { "epoch": 1.7325256290773532, "grad_norm": 0.14248196498600338, "learning_rate": 9.389023127373146e-06, "loss": 0.4558, "step": 1859 }, { "epoch": 1.733457595526561, "grad_norm": 0.15228799384102304, "learning_rate": 9.382119433897135e-06, "loss": 0.4397, "step": 1860 }, { "epoch": 1.7343895619757688, "grad_norm": 0.14723985420429786, "learning_rate": 9.375215740421126e-06, "loss": 0.4491, "step": 1861 }, { "epoch": 1.7353215284249766, "grad_norm": 0.14839075091970028, "learning_rate": 9.368312046945116e-06, "loss": 0.4427, "step": 1862 }, { "epoch": 1.7362534948741846, "grad_norm": 0.15616874672439057, "learning_rate": 9.361408353469107e-06, "loss": 0.4712, "step": 1863 }, { "epoch": 1.7371854613233924, "grad_norm": 0.14999594868500032, "learning_rate": 9.354504659993096e-06, "loss": 0.476, "step": 1864 }, { "epoch": 1.7381174277726001, "grad_norm": 0.14829974378494107, "learning_rate": 9.347600966517087e-06, "loss": 0.4657, "step": 1865 }, { "epoch": 1.7390493942218082, "grad_norm": 0.1522041622490136, "learning_rate": 9.340697273041078e-06, "loss": 0.4507, "step": 1866 }, { "epoch": 1.739981360671016, "grad_norm": 0.15000822509128967, "learning_rate": 9.333793579565068e-06, "loss": 0.4746, "step": 1867 }, { "epoch": 1.7409133271202237, "grad_norm": 0.13610213980477112, "learning_rate": 9.326889886089059e-06, "loss": 0.4492, "step": 1868 }, { "epoch": 1.7418452935694315, "grad_norm": 0.15277934739317287, "learning_rate": 9.319986192613048e-06, "loss": 0.4549, "step": 1869 }, { "epoch": 1.7427772600186393, "grad_norm": 0.1567221539539807, "learning_rate": 9.313082499137039e-06, "loss": 0.4558, "step": 1870 }, { "epoch": 1.743709226467847, "grad_norm": 0.14975192102839532, "learning_rate": 9.30617880566103e-06, "loss": 0.4733, "step": 1871 }, { "epoch": 1.7446411929170549, "grad_norm": 0.16253047183336838, "learning_rate": 9.29927511218502e-06, "loss": 0.4758, "step": 1872 }, { "epoch": 1.7455731593662627, "grad_norm": 0.15321591566784895, "learning_rate": 9.29237141870901e-06, "loss": 0.4754, "step": 1873 }, { "epoch": 1.7465051258154707, "grad_norm": 0.16210614034068696, "learning_rate": 9.285467725233e-06, "loss": 0.4459, "step": 1874 }, { "epoch": 1.7474370922646785, "grad_norm": 0.14971100769580412, "learning_rate": 9.278564031756991e-06, "loss": 0.4579, "step": 1875 }, { "epoch": 1.7483690587138863, "grad_norm": 0.1494367057343409, "learning_rate": 9.271660338280982e-06, "loss": 0.479, "step": 1876 }, { "epoch": 1.7493010251630943, "grad_norm": 0.14578753258293264, "learning_rate": 9.264756644804972e-06, "loss": 0.4431, "step": 1877 }, { "epoch": 1.750232991612302, "grad_norm": 0.22964928131747753, "learning_rate": 9.257852951328961e-06, "loss": 0.4582, "step": 1878 }, { "epoch": 1.7511649580615098, "grad_norm": 0.16246360859575068, "learning_rate": 9.250949257852952e-06, "loss": 0.4738, "step": 1879 }, { "epoch": 1.7520969245107176, "grad_norm": 0.14240421737698386, "learning_rate": 9.244045564376943e-06, "loss": 0.4651, "step": 1880 }, { "epoch": 1.7530288909599254, "grad_norm": 0.14779437312040083, "learning_rate": 9.237141870900934e-06, "loss": 0.4729, "step": 1881 }, { "epoch": 1.7539608574091332, "grad_norm": 0.15114770626326976, "learning_rate": 9.230238177424923e-06, "loss": 0.4601, "step": 1882 }, { "epoch": 1.754892823858341, "grad_norm": 0.16455871728876895, "learning_rate": 9.223334483948913e-06, "loss": 0.4679, "step": 1883 }, { "epoch": 1.7558247903075488, "grad_norm": 0.160881255562342, "learning_rate": 9.216430790472904e-06, "loss": 0.4551, "step": 1884 }, { "epoch": 1.7567567567567568, "grad_norm": 0.15137504645502783, "learning_rate": 9.209527096996895e-06, "loss": 0.4615, "step": 1885 }, { "epoch": 1.7576887232059646, "grad_norm": 0.1557752839665583, "learning_rate": 9.202623403520884e-06, "loss": 0.4729, "step": 1886 }, { "epoch": 1.7586206896551724, "grad_norm": 0.15921832901702906, "learning_rate": 9.195719710044875e-06, "loss": 0.4666, "step": 1887 }, { "epoch": 1.7595526561043804, "grad_norm": 0.13776230776486378, "learning_rate": 9.188816016568865e-06, "loss": 0.4388, "step": 1888 }, { "epoch": 1.7604846225535882, "grad_norm": 0.14902919191713615, "learning_rate": 9.181912323092856e-06, "loss": 0.4465, "step": 1889 }, { "epoch": 1.761416589002796, "grad_norm": 0.17501614885871897, "learning_rate": 9.175008629616847e-06, "loss": 0.4685, "step": 1890 }, { "epoch": 1.7623485554520038, "grad_norm": 0.15698284944749563, "learning_rate": 9.168104936140836e-06, "loss": 0.4433, "step": 1891 }, { "epoch": 1.7632805219012115, "grad_norm": 0.14309414119683037, "learning_rate": 9.161201242664827e-06, "loss": 0.4638, "step": 1892 }, { "epoch": 1.7642124883504193, "grad_norm": 0.1500556955165316, "learning_rate": 9.154297549188816e-06, "loss": 0.4633, "step": 1893 }, { "epoch": 1.7651444547996271, "grad_norm": 0.14479520155808528, "learning_rate": 9.147393855712808e-06, "loss": 0.4607, "step": 1894 }, { "epoch": 1.766076421248835, "grad_norm": 0.15235105381645772, "learning_rate": 9.140490162236797e-06, "loss": 0.4851, "step": 1895 }, { "epoch": 1.767008387698043, "grad_norm": 0.16452076104558894, "learning_rate": 9.133586468760788e-06, "loss": 0.4615, "step": 1896 }, { "epoch": 1.7679403541472507, "grad_norm": 0.14653677200158807, "learning_rate": 9.126682775284779e-06, "loss": 0.4511, "step": 1897 }, { "epoch": 1.7688723205964585, "grad_norm": 0.1352823568465868, "learning_rate": 9.119779081808768e-06, "loss": 0.4463, "step": 1898 }, { "epoch": 1.7698042870456665, "grad_norm": 0.141971645250039, "learning_rate": 9.11287538833276e-06, "loss": 0.4545, "step": 1899 }, { "epoch": 1.7707362534948743, "grad_norm": 0.14758711032008995, "learning_rate": 9.105971694856749e-06, "loss": 0.4395, "step": 1900 }, { "epoch": 1.771668219944082, "grad_norm": 0.14182973403265478, "learning_rate": 9.09906800138074e-06, "loss": 0.4447, "step": 1901 }, { "epoch": 1.7726001863932899, "grad_norm": 0.1305751084808827, "learning_rate": 9.092164307904729e-06, "loss": 0.4391, "step": 1902 }, { "epoch": 1.7735321528424977, "grad_norm": 0.13535396400890745, "learning_rate": 9.08526061442872e-06, "loss": 0.4252, "step": 1903 }, { "epoch": 1.7744641192917054, "grad_norm": 0.14456262343278553, "learning_rate": 9.07835692095271e-06, "loss": 0.4744, "step": 1904 }, { "epoch": 1.7753960857409132, "grad_norm": 0.14094894892675863, "learning_rate": 9.071453227476701e-06, "loss": 0.4611, "step": 1905 }, { "epoch": 1.776328052190121, "grad_norm": 0.16318873070578088, "learning_rate": 9.06454953400069e-06, "loss": 0.4668, "step": 1906 }, { "epoch": 1.777260018639329, "grad_norm": 0.15374420954612342, "learning_rate": 9.05764584052468e-06, "loss": 0.4683, "step": 1907 }, { "epoch": 1.7781919850885368, "grad_norm": 0.14703206357750845, "learning_rate": 9.050742147048671e-06, "loss": 0.4586, "step": 1908 }, { "epoch": 1.7791239515377446, "grad_norm": 0.17031555912041546, "learning_rate": 9.043838453572662e-06, "loss": 0.4673, "step": 1909 }, { "epoch": 1.7800559179869526, "grad_norm": 0.15910411688417817, "learning_rate": 9.036934760096653e-06, "loss": 0.4683, "step": 1910 }, { "epoch": 1.7809878844361604, "grad_norm": 0.16654453794675933, "learning_rate": 9.030031066620642e-06, "loss": 0.463, "step": 1911 }, { "epoch": 1.7819198508853682, "grad_norm": 0.1383028693381574, "learning_rate": 9.023127373144633e-06, "loss": 0.426, "step": 1912 }, { "epoch": 1.782851817334576, "grad_norm": 0.13565990688095422, "learning_rate": 9.016223679668623e-06, "loss": 0.4511, "step": 1913 }, { "epoch": 1.7837837837837838, "grad_norm": 0.16048774050913506, "learning_rate": 9.009319986192614e-06, "loss": 0.4699, "step": 1914 }, { "epoch": 1.7847157502329916, "grad_norm": 0.15670457556760448, "learning_rate": 9.002416292716603e-06, "loss": 0.4303, "step": 1915 }, { "epoch": 1.7856477166821993, "grad_norm": 0.16444819562578886, "learning_rate": 8.995512599240594e-06, "loss": 0.4659, "step": 1916 }, { "epoch": 1.7865796831314071, "grad_norm": 0.15147160633743031, "learning_rate": 8.988608905764585e-06, "loss": 0.4748, "step": 1917 }, { "epoch": 1.7875116495806151, "grad_norm": 0.1483238393893089, "learning_rate": 8.981705212288575e-06, "loss": 0.4732, "step": 1918 }, { "epoch": 1.788443616029823, "grad_norm": 0.16159837404697538, "learning_rate": 8.974801518812566e-06, "loss": 0.4515, "step": 1919 }, { "epoch": 1.7893755824790307, "grad_norm": 0.16802539471300057, "learning_rate": 8.967897825336555e-06, "loss": 0.4801, "step": 1920 }, { "epoch": 1.7903075489282387, "grad_norm": 0.16997445696287852, "learning_rate": 8.960994131860546e-06, "loss": 0.481, "step": 1921 }, { "epoch": 1.7912395153774465, "grad_norm": 0.1636745957319958, "learning_rate": 8.954090438384537e-06, "loss": 0.46, "step": 1922 }, { "epoch": 1.7921714818266543, "grad_norm": 0.15446359313767527, "learning_rate": 8.947186744908527e-06, "loss": 0.4641, "step": 1923 }, { "epoch": 1.793103448275862, "grad_norm": 0.14646214036102004, "learning_rate": 8.940283051432516e-06, "loss": 0.4479, "step": 1924 }, { "epoch": 1.7940354147250699, "grad_norm": 0.143075061842842, "learning_rate": 8.933379357956507e-06, "loss": 0.4454, "step": 1925 }, { "epoch": 1.7949673811742777, "grad_norm": 0.1594901769807839, "learning_rate": 8.926475664480498e-06, "loss": 0.4426, "step": 1926 }, { "epoch": 1.7958993476234855, "grad_norm": 0.14728946895656234, "learning_rate": 8.919571971004489e-06, "loss": 0.4526, "step": 1927 }, { "epoch": 1.7968313140726933, "grad_norm": 0.15490491430974762, "learning_rate": 8.91266827752848e-06, "loss": 0.4636, "step": 1928 }, { "epoch": 1.7977632805219013, "grad_norm": 0.14646248452496635, "learning_rate": 8.905764584052468e-06, "loss": 0.4472, "step": 1929 }, { "epoch": 1.798695246971109, "grad_norm": 0.16373783237145467, "learning_rate": 8.898860890576459e-06, "loss": 0.4719, "step": 1930 }, { "epoch": 1.7996272134203168, "grad_norm": 0.15076118104096273, "learning_rate": 8.89195719710045e-06, "loss": 0.4647, "step": 1931 }, { "epoch": 1.8005591798695249, "grad_norm": 0.1434642721800948, "learning_rate": 8.88505350362444e-06, "loss": 0.4606, "step": 1932 }, { "epoch": 1.8014911463187326, "grad_norm": 0.15726415623646936, "learning_rate": 8.87814981014843e-06, "loss": 0.4691, "step": 1933 }, { "epoch": 1.8024231127679404, "grad_norm": 0.15077744135744522, "learning_rate": 8.87124611667242e-06, "loss": 0.4521, "step": 1934 }, { "epoch": 1.8033550792171482, "grad_norm": 0.14880710242486772, "learning_rate": 8.864342423196411e-06, "loss": 0.4613, "step": 1935 }, { "epoch": 1.804287045666356, "grad_norm": 0.15277218835697728, "learning_rate": 8.857438729720402e-06, "loss": 0.4611, "step": 1936 }, { "epoch": 1.8052190121155638, "grad_norm": 0.15041501670091234, "learning_rate": 8.850535036244391e-06, "loss": 0.4522, "step": 1937 }, { "epoch": 1.8061509785647716, "grad_norm": 0.13944598223025678, "learning_rate": 8.843631342768382e-06, "loss": 0.4286, "step": 1938 }, { "epoch": 1.8070829450139794, "grad_norm": 0.13622446289634316, "learning_rate": 8.836727649292372e-06, "loss": 0.4445, "step": 1939 }, { "epoch": 1.8080149114631874, "grad_norm": 0.14918203672098063, "learning_rate": 8.829823955816363e-06, "loss": 0.4484, "step": 1940 }, { "epoch": 1.8089468779123952, "grad_norm": 0.14674996892311507, "learning_rate": 8.822920262340354e-06, "loss": 0.4615, "step": 1941 }, { "epoch": 1.809878844361603, "grad_norm": 0.15593343818999558, "learning_rate": 8.816016568864343e-06, "loss": 0.4751, "step": 1942 }, { "epoch": 1.810810810810811, "grad_norm": 0.1461721731321111, "learning_rate": 8.809112875388334e-06, "loss": 0.4746, "step": 1943 }, { "epoch": 1.8117427772600188, "grad_norm": 0.14882122933856334, "learning_rate": 8.802209181912323e-06, "loss": 0.4701, "step": 1944 }, { "epoch": 1.8126747437092265, "grad_norm": 0.14872195555175863, "learning_rate": 8.795305488436315e-06, "loss": 0.4819, "step": 1945 }, { "epoch": 1.8136067101584343, "grad_norm": 0.1450278699655163, "learning_rate": 8.788401794960304e-06, "loss": 0.4477, "step": 1946 }, { "epoch": 1.8145386766076421, "grad_norm": 0.1440320057772454, "learning_rate": 8.781498101484295e-06, "loss": 0.4341, "step": 1947 }, { "epoch": 1.81547064305685, "grad_norm": 0.1462745133076523, "learning_rate": 8.774594408008286e-06, "loss": 0.4546, "step": 1948 }, { "epoch": 1.8164026095060577, "grad_norm": 0.1464924262328449, "learning_rate": 8.767690714532275e-06, "loss": 0.4438, "step": 1949 }, { "epoch": 1.8173345759552655, "grad_norm": 0.14593995535991003, "learning_rate": 8.760787021056267e-06, "loss": 0.4459, "step": 1950 }, { "epoch": 1.8182665424044733, "grad_norm": 0.163482893571316, "learning_rate": 8.753883327580256e-06, "loss": 0.4609, "step": 1951 }, { "epoch": 1.8191985088536813, "grad_norm": 0.14412287885979488, "learning_rate": 8.746979634104247e-06, "loss": 0.4401, "step": 1952 }, { "epoch": 1.820130475302889, "grad_norm": 0.15048559686685167, "learning_rate": 8.740075940628236e-06, "loss": 0.4565, "step": 1953 }, { "epoch": 1.821062441752097, "grad_norm": 0.1554851595176216, "learning_rate": 8.733172247152227e-06, "loss": 0.4477, "step": 1954 }, { "epoch": 1.8219944082013049, "grad_norm": 0.14055870818704347, "learning_rate": 8.726268553676217e-06, "loss": 0.4596, "step": 1955 }, { "epoch": 1.8229263746505127, "grad_norm": 0.1561918948065402, "learning_rate": 8.719364860200208e-06, "loss": 0.4358, "step": 1956 }, { "epoch": 1.8238583410997204, "grad_norm": 0.1433864994834988, "learning_rate": 8.712461166724197e-06, "loss": 0.4676, "step": 1957 }, { "epoch": 1.8247903075489282, "grad_norm": 0.14133184138708033, "learning_rate": 8.705557473248188e-06, "loss": 0.4406, "step": 1958 }, { "epoch": 1.825722273998136, "grad_norm": 0.21329636531407264, "learning_rate": 8.698653779772179e-06, "loss": 0.4682, "step": 1959 }, { "epoch": 1.8266542404473438, "grad_norm": 0.14237319910517698, "learning_rate": 8.69175008629617e-06, "loss": 0.4683, "step": 1960 }, { "epoch": 1.8275862068965516, "grad_norm": 0.15508856640983576, "learning_rate": 8.68484639282016e-06, "loss": 0.4585, "step": 1961 }, { "epoch": 1.8285181733457594, "grad_norm": 0.16138487395130416, "learning_rate": 8.677942699344149e-06, "loss": 0.4662, "step": 1962 }, { "epoch": 1.8294501397949674, "grad_norm": 0.15139538263606425, "learning_rate": 8.67103900586814e-06, "loss": 0.4536, "step": 1963 }, { "epoch": 1.8303821062441752, "grad_norm": 0.1749354009883275, "learning_rate": 8.66413531239213e-06, "loss": 0.4769, "step": 1964 }, { "epoch": 1.8313140726933832, "grad_norm": 0.1550972852140161, "learning_rate": 8.657231618916121e-06, "loss": 0.4642, "step": 1965 }, { "epoch": 1.832246039142591, "grad_norm": 0.15496733861993983, "learning_rate": 8.65032792544011e-06, "loss": 0.4553, "step": 1966 }, { "epoch": 1.8331780055917988, "grad_norm": 0.16946561460730175, "learning_rate": 8.643424231964101e-06, "loss": 0.4594, "step": 1967 }, { "epoch": 1.8341099720410066, "grad_norm": 0.1581858805729683, "learning_rate": 8.636520538488092e-06, "loss": 0.4416, "step": 1968 }, { "epoch": 1.8350419384902144, "grad_norm": 0.1643046418778528, "learning_rate": 8.629616845012082e-06, "loss": 0.4941, "step": 1969 }, { "epoch": 1.8359739049394221, "grad_norm": 0.15064703575809618, "learning_rate": 8.622713151536073e-06, "loss": 0.461, "step": 1970 }, { "epoch": 1.83690587138863, "grad_norm": 0.1648978552106011, "learning_rate": 8.615809458060062e-06, "loss": 0.4835, "step": 1971 }, { "epoch": 1.8378378378378377, "grad_norm": 0.16278403536371072, "learning_rate": 8.608905764584053e-06, "loss": 0.4738, "step": 1972 }, { "epoch": 1.8387698042870455, "grad_norm": 0.15157552184028053, "learning_rate": 8.602002071108044e-06, "loss": 0.4586, "step": 1973 }, { "epoch": 1.8397017707362535, "grad_norm": 0.1548950326193602, "learning_rate": 8.595098377632034e-06, "loss": 0.4773, "step": 1974 }, { "epoch": 1.8406337371854613, "grad_norm": 0.14953474316418525, "learning_rate": 8.588194684156023e-06, "loss": 0.4726, "step": 1975 }, { "epoch": 1.8415657036346693, "grad_norm": 0.15449207869349255, "learning_rate": 8.581290990680014e-06, "loss": 0.4713, "step": 1976 }, { "epoch": 1.842497670083877, "grad_norm": 0.15084499638717383, "learning_rate": 8.574387297204005e-06, "loss": 0.4692, "step": 1977 }, { "epoch": 1.843429636533085, "grad_norm": 0.15899892915181232, "learning_rate": 8.567483603727996e-06, "loss": 0.458, "step": 1978 }, { "epoch": 1.8443616029822927, "grad_norm": 0.14314626321476476, "learning_rate": 8.560579910251986e-06, "loss": 0.4339, "step": 1979 }, { "epoch": 1.8452935694315005, "grad_norm": 0.15012378471428917, "learning_rate": 8.553676216775975e-06, "loss": 0.4426, "step": 1980 }, { "epoch": 1.8462255358807083, "grad_norm": 0.16475606517431968, "learning_rate": 8.546772523299966e-06, "loss": 0.4547, "step": 1981 }, { "epoch": 1.847157502329916, "grad_norm": 0.15512806578347177, "learning_rate": 8.539868829823957e-06, "loss": 0.448, "step": 1982 }, { "epoch": 1.8480894687791238, "grad_norm": 0.14199476733313557, "learning_rate": 8.532965136347948e-06, "loss": 0.4432, "step": 1983 }, { "epoch": 1.8490214352283316, "grad_norm": 0.14854707481162605, "learning_rate": 8.526061442871937e-06, "loss": 0.4607, "step": 1984 }, { "epoch": 1.8499534016775396, "grad_norm": 0.145693323047365, "learning_rate": 8.519157749395927e-06, "loss": 0.4499, "step": 1985 }, { "epoch": 1.8508853681267474, "grad_norm": 0.1590487114545389, "learning_rate": 8.512254055919918e-06, "loss": 0.4936, "step": 1986 }, { "epoch": 1.8518173345759554, "grad_norm": 0.1677134825112216, "learning_rate": 8.505350362443909e-06, "loss": 0.4759, "step": 1987 }, { "epoch": 1.8527493010251632, "grad_norm": 0.15563998222454348, "learning_rate": 8.4984466689679e-06, "loss": 0.4769, "step": 1988 }, { "epoch": 1.853681267474371, "grad_norm": 0.1528183364620558, "learning_rate": 8.491542975491889e-06, "loss": 0.4827, "step": 1989 }, { "epoch": 1.8546132339235788, "grad_norm": 0.15139447363556582, "learning_rate": 8.48463928201588e-06, "loss": 0.4615, "step": 1990 }, { "epoch": 1.8555452003727866, "grad_norm": 0.14558324781018936, "learning_rate": 8.47773558853987e-06, "loss": 0.4294, "step": 1991 }, { "epoch": 1.8564771668219944, "grad_norm": 0.15459915956449616, "learning_rate": 8.47083189506386e-06, "loss": 0.469, "step": 1992 }, { "epoch": 1.8574091332712022, "grad_norm": 0.15935550057246192, "learning_rate": 8.46392820158785e-06, "loss": 0.4571, "step": 1993 }, { "epoch": 1.85834109972041, "grad_norm": 0.14478393088639077, "learning_rate": 8.45702450811184e-06, "loss": 0.4563, "step": 1994 }, { "epoch": 1.8592730661696177, "grad_norm": 0.14745682985866915, "learning_rate": 8.45012081463583e-06, "loss": 0.4695, "step": 1995 }, { "epoch": 1.8602050326188257, "grad_norm": 0.15566382836019516, "learning_rate": 8.443217121159822e-06, "loss": 0.4655, "step": 1996 }, { "epoch": 1.8611369990680335, "grad_norm": 0.15218137961389153, "learning_rate": 8.436313427683811e-06, "loss": 0.4561, "step": 1997 }, { "epoch": 1.8620689655172413, "grad_norm": 0.13864813383281527, "learning_rate": 8.429409734207802e-06, "loss": 0.421, "step": 1998 }, { "epoch": 1.8630009319664493, "grad_norm": 0.1540501509833112, "learning_rate": 8.422506040731793e-06, "loss": 0.4946, "step": 1999 }, { "epoch": 1.8639328984156571, "grad_norm": 0.16011004580539828, "learning_rate": 8.415602347255782e-06, "loss": 0.4452, "step": 2000 }, { "epoch": 1.864864864864865, "grad_norm": 0.15053346022104624, "learning_rate": 8.408698653779774e-06, "loss": 0.4415, "step": 2001 }, { "epoch": 1.8657968313140727, "grad_norm": 0.155435350452408, "learning_rate": 8.401794960303763e-06, "loss": 0.4768, "step": 2002 }, { "epoch": 1.8667287977632805, "grad_norm": 0.1547636333037352, "learning_rate": 8.394891266827754e-06, "loss": 0.428, "step": 2003 }, { "epoch": 1.8676607642124883, "grad_norm": 0.14654877997121715, "learning_rate": 8.387987573351743e-06, "loss": 0.4563, "step": 2004 }, { "epoch": 1.868592730661696, "grad_norm": 0.1502465341502632, "learning_rate": 8.381083879875734e-06, "loss": 0.4619, "step": 2005 }, { "epoch": 1.8695246971109039, "grad_norm": 0.1508136113980444, "learning_rate": 8.374180186399724e-06, "loss": 0.4721, "step": 2006 }, { "epoch": 1.8704566635601119, "grad_norm": 0.16930334497891275, "learning_rate": 8.367276492923715e-06, "loss": 0.4791, "step": 2007 }, { "epoch": 1.8713886300093197, "grad_norm": 0.15061786849582007, "learning_rate": 8.360372799447706e-06, "loss": 0.4656, "step": 2008 }, { "epoch": 1.8723205964585274, "grad_norm": 0.1436597246830458, "learning_rate": 8.353469105971695e-06, "loss": 0.4639, "step": 2009 }, { "epoch": 1.8732525629077355, "grad_norm": 0.153493906861198, "learning_rate": 8.346565412495686e-06, "loss": 0.4573, "step": 2010 }, { "epoch": 1.8741845293569432, "grad_norm": 0.16390045107502457, "learning_rate": 8.339661719019676e-06, "loss": 0.4416, "step": 2011 }, { "epoch": 1.875116495806151, "grad_norm": 0.15004753953597322, "learning_rate": 8.332758025543667e-06, "loss": 0.4195, "step": 2012 }, { "epoch": 1.8760484622553588, "grad_norm": 0.1594733696387208, "learning_rate": 8.325854332067656e-06, "loss": 0.4839, "step": 2013 }, { "epoch": 1.8769804287045666, "grad_norm": 0.17603638517913187, "learning_rate": 8.318950638591647e-06, "loss": 0.4706, "step": 2014 }, { "epoch": 1.8779123951537744, "grad_norm": 0.14090086792824782, "learning_rate": 8.312046945115638e-06, "loss": 0.4564, "step": 2015 }, { "epoch": 1.8788443616029822, "grad_norm": 0.15231480771116618, "learning_rate": 8.305143251639628e-06, "loss": 0.473, "step": 2016 }, { "epoch": 1.87977632805219, "grad_norm": 0.15307021237526663, "learning_rate": 8.298239558163617e-06, "loss": 0.4354, "step": 2017 }, { "epoch": 1.880708294501398, "grad_norm": 0.15504637673919874, "learning_rate": 8.291335864687608e-06, "loss": 0.4329, "step": 2018 }, { "epoch": 1.8816402609506058, "grad_norm": 0.13975477565299313, "learning_rate": 8.284432171211599e-06, "loss": 0.4484, "step": 2019 }, { "epoch": 1.8825722273998136, "grad_norm": 0.15489024472815743, "learning_rate": 8.27752847773559e-06, "loss": 0.4576, "step": 2020 }, { "epoch": 1.8835041938490216, "grad_norm": 0.15409014562835657, "learning_rate": 8.27062478425958e-06, "loss": 0.4599, "step": 2021 }, { "epoch": 1.8844361602982294, "grad_norm": 0.14811085147132255, "learning_rate": 8.26372109078357e-06, "loss": 0.4453, "step": 2022 }, { "epoch": 1.8853681267474371, "grad_norm": 0.14757106817422722, "learning_rate": 8.25681739730756e-06, "loss": 0.4563, "step": 2023 }, { "epoch": 1.886300093196645, "grad_norm": 0.15551079008449786, "learning_rate": 8.24991370383155e-06, "loss": 0.4424, "step": 2024 }, { "epoch": 1.8872320596458527, "grad_norm": 0.15552702543097668, "learning_rate": 8.243010010355541e-06, "loss": 0.4563, "step": 2025 }, { "epoch": 1.8881640260950605, "grad_norm": 0.1505227252542653, "learning_rate": 8.23610631687953e-06, "loss": 0.4726, "step": 2026 }, { "epoch": 1.8890959925442683, "grad_norm": 0.14464927960459625, "learning_rate": 8.229202623403521e-06, "loss": 0.4535, "step": 2027 }, { "epoch": 1.890027958993476, "grad_norm": 0.14969532441653327, "learning_rate": 8.222298929927512e-06, "loss": 0.4451, "step": 2028 }, { "epoch": 1.890959925442684, "grad_norm": 0.1484846247286392, "learning_rate": 8.215395236451503e-06, "loss": 0.4668, "step": 2029 }, { "epoch": 1.8918918918918919, "grad_norm": 0.22311985172086046, "learning_rate": 8.208491542975493e-06, "loss": 0.4641, "step": 2030 }, { "epoch": 1.8928238583410997, "grad_norm": 0.151718343937441, "learning_rate": 8.201587849499482e-06, "loss": 0.4749, "step": 2031 }, { "epoch": 1.8937558247903077, "grad_norm": 0.1558462824914481, "learning_rate": 8.194684156023473e-06, "loss": 0.4614, "step": 2032 }, { "epoch": 1.8946877912395155, "grad_norm": 0.14964718180457418, "learning_rate": 8.187780462547464e-06, "loss": 0.4596, "step": 2033 }, { "epoch": 1.8956197576887233, "grad_norm": 0.13770029175154458, "learning_rate": 8.180876769071455e-06, "loss": 0.4261, "step": 2034 }, { "epoch": 1.896551724137931, "grad_norm": 0.14421945489536203, "learning_rate": 8.173973075595444e-06, "loss": 0.4466, "step": 2035 }, { "epoch": 1.8974836905871388, "grad_norm": 0.18511098808341628, "learning_rate": 8.167069382119434e-06, "loss": 0.4823, "step": 2036 }, { "epoch": 1.8984156570363466, "grad_norm": 0.14016459161154443, "learning_rate": 8.160165688643425e-06, "loss": 0.4176, "step": 2037 }, { "epoch": 1.8993476234855544, "grad_norm": 0.1513950893392854, "learning_rate": 8.153261995167416e-06, "loss": 0.4584, "step": 2038 }, { "epoch": 1.9002795899347622, "grad_norm": 0.14595302143397648, "learning_rate": 8.146358301691407e-06, "loss": 0.4777, "step": 2039 }, { "epoch": 1.9012115563839702, "grad_norm": 0.14505506760865103, "learning_rate": 8.139454608215396e-06, "loss": 0.4538, "step": 2040 }, { "epoch": 1.902143522833178, "grad_norm": 0.15549769162115362, "learning_rate": 8.132550914739386e-06, "loss": 0.4583, "step": 2041 }, { "epoch": 1.9030754892823858, "grad_norm": 0.1545532298976326, "learning_rate": 8.125647221263377e-06, "loss": 0.4923, "step": 2042 }, { "epoch": 1.9040074557315938, "grad_norm": 0.15228648683879625, "learning_rate": 8.118743527787368e-06, "loss": 0.5066, "step": 2043 }, { "epoch": 1.9049394221808016, "grad_norm": 0.15072156448254212, "learning_rate": 8.111839834311357e-06, "loss": 0.4732, "step": 2044 }, { "epoch": 1.9058713886300094, "grad_norm": 0.15552171473885848, "learning_rate": 8.104936140835348e-06, "loss": 0.4496, "step": 2045 }, { "epoch": 1.9068033550792172, "grad_norm": 0.16186508252382323, "learning_rate": 8.098032447359337e-06, "loss": 0.4588, "step": 2046 }, { "epoch": 1.907735321528425, "grad_norm": 0.1523048767921694, "learning_rate": 8.091128753883329e-06, "loss": 0.4478, "step": 2047 }, { "epoch": 1.9086672879776327, "grad_norm": 0.17072645285890758, "learning_rate": 8.084225060407318e-06, "loss": 0.5034, "step": 2048 }, { "epoch": 1.9095992544268405, "grad_norm": 0.15707832319261594, "learning_rate": 8.077321366931309e-06, "loss": 0.4444, "step": 2049 }, { "epoch": 1.9105312208760483, "grad_norm": 0.15247044657873593, "learning_rate": 8.0704176734553e-06, "loss": 0.4744, "step": 2050 }, { "epoch": 1.9114631873252563, "grad_norm": 0.15072773493334318, "learning_rate": 8.063513979979289e-06, "loss": 0.4801, "step": 2051 }, { "epoch": 1.9123951537744641, "grad_norm": 0.1482402719509801, "learning_rate": 8.056610286503281e-06, "loss": 0.4365, "step": 2052 }, { "epoch": 1.913327120223672, "grad_norm": 0.1454921320823503, "learning_rate": 8.04970659302727e-06, "loss": 0.4597, "step": 2053 }, { "epoch": 1.91425908667288, "grad_norm": 0.15300775201861985, "learning_rate": 8.04280289955126e-06, "loss": 0.4524, "step": 2054 }, { "epoch": 1.9151910531220877, "grad_norm": 0.1628711606674642, "learning_rate": 8.03589920607525e-06, "loss": 0.4557, "step": 2055 }, { "epoch": 1.9161230195712955, "grad_norm": 0.14461532310086847, "learning_rate": 8.02899551259924e-06, "loss": 0.4478, "step": 2056 }, { "epoch": 1.9170549860205033, "grad_norm": 0.14843087008658373, "learning_rate": 8.022091819123231e-06, "loss": 0.4642, "step": 2057 }, { "epoch": 1.917986952469711, "grad_norm": 0.16067085697124836, "learning_rate": 8.015188125647222e-06, "loss": 0.4952, "step": 2058 }, { "epoch": 1.9189189189189189, "grad_norm": 0.16382307695602394, "learning_rate": 8.008284432171213e-06, "loss": 0.477, "step": 2059 }, { "epoch": 1.9198508853681266, "grad_norm": 0.1479920095629882, "learning_rate": 8.001380738695202e-06, "loss": 0.4589, "step": 2060 }, { "epoch": 1.9207828518173344, "grad_norm": 0.14091630652651332, "learning_rate": 7.994477045219193e-06, "loss": 0.4576, "step": 2061 }, { "epoch": 1.9217148182665424, "grad_norm": 0.14502693904847005, "learning_rate": 7.987573351743183e-06, "loss": 0.4564, "step": 2062 }, { "epoch": 1.9226467847157502, "grad_norm": 0.14342204622464044, "learning_rate": 7.980669658267174e-06, "loss": 0.4526, "step": 2063 }, { "epoch": 1.923578751164958, "grad_norm": 0.13678678467014332, "learning_rate": 7.973765964791163e-06, "loss": 0.4564, "step": 2064 }, { "epoch": 1.924510717614166, "grad_norm": 0.15614046130024753, "learning_rate": 7.966862271315154e-06, "loss": 0.4463, "step": 2065 }, { "epoch": 1.9254426840633738, "grad_norm": 0.14444058947887808, "learning_rate": 7.959958577839145e-06, "loss": 0.4501, "step": 2066 }, { "epoch": 1.9263746505125816, "grad_norm": 0.1530493978128678, "learning_rate": 7.953054884363135e-06, "loss": 0.4622, "step": 2067 }, { "epoch": 1.9273066169617894, "grad_norm": 0.15672936305683804, "learning_rate": 7.946151190887124e-06, "loss": 0.5012, "step": 2068 }, { "epoch": 1.9282385834109972, "grad_norm": 0.13920763566351363, "learning_rate": 7.939247497411115e-06, "loss": 0.4271, "step": 2069 }, { "epoch": 1.929170549860205, "grad_norm": 0.15094893933817652, "learning_rate": 7.932343803935106e-06, "loss": 0.4641, "step": 2070 }, { "epoch": 1.9301025163094128, "grad_norm": 0.15348339898203456, "learning_rate": 7.925440110459096e-06, "loss": 0.486, "step": 2071 }, { "epoch": 1.9310344827586206, "grad_norm": 0.14177345637071084, "learning_rate": 7.918536416983087e-06, "loss": 0.4576, "step": 2072 }, { "epoch": 1.9319664492078286, "grad_norm": 0.1432654679308148, "learning_rate": 7.911632723507076e-06, "loss": 0.4504, "step": 2073 }, { "epoch": 1.9328984156570364, "grad_norm": 0.15297443830274013, "learning_rate": 7.904729030031067e-06, "loss": 0.4736, "step": 2074 }, { "epoch": 1.9338303821062441, "grad_norm": 0.16078782147624465, "learning_rate": 7.897825336555058e-06, "loss": 0.4627, "step": 2075 }, { "epoch": 1.9347623485554521, "grad_norm": 0.15098780035270332, "learning_rate": 7.890921643079048e-06, "loss": 0.4626, "step": 2076 }, { "epoch": 1.93569431500466, "grad_norm": 0.14393343006228498, "learning_rate": 7.884017949603038e-06, "loss": 0.4565, "step": 2077 }, { "epoch": 1.9366262814538677, "grad_norm": 0.1470239153805876, "learning_rate": 7.877114256127028e-06, "loss": 0.4568, "step": 2078 }, { "epoch": 1.9375582479030755, "grad_norm": 0.15654334522554322, "learning_rate": 7.870210562651019e-06, "loss": 0.4653, "step": 2079 }, { "epoch": 1.9384902143522833, "grad_norm": 0.1490072150233691, "learning_rate": 7.86330686917501e-06, "loss": 0.4561, "step": 2080 }, { "epoch": 1.939422180801491, "grad_norm": 0.16082713925664996, "learning_rate": 7.856403175699e-06, "loss": 0.4257, "step": 2081 }, { "epoch": 1.9403541472506989, "grad_norm": 0.1451502360309991, "learning_rate": 7.84949948222299e-06, "loss": 0.4555, "step": 2082 }, { "epoch": 1.9412861136999067, "grad_norm": 0.15018124288711174, "learning_rate": 7.84259578874698e-06, "loss": 0.4557, "step": 2083 }, { "epoch": 1.9422180801491147, "grad_norm": 0.1490141698817162, "learning_rate": 7.835692095270971e-06, "loss": 0.4661, "step": 2084 }, { "epoch": 1.9431500465983225, "grad_norm": 0.14866737235403696, "learning_rate": 7.828788401794962e-06, "loss": 0.5012, "step": 2085 }, { "epoch": 1.9440820130475303, "grad_norm": 0.147045299217468, "learning_rate": 7.82188470831895e-06, "loss": 0.4768, "step": 2086 }, { "epoch": 1.9450139794967383, "grad_norm": 0.1429000065070586, "learning_rate": 7.814981014842941e-06, "loss": 0.4416, "step": 2087 }, { "epoch": 1.945945945945946, "grad_norm": 0.14099707969434946, "learning_rate": 7.808077321366932e-06, "loss": 0.459, "step": 2088 }, { "epoch": 1.9468779123951538, "grad_norm": 0.1421342963208717, "learning_rate": 7.801173627890923e-06, "loss": 0.4557, "step": 2089 }, { "epoch": 1.9478098788443616, "grad_norm": 0.15569500820314253, "learning_rate": 7.794269934414914e-06, "loss": 0.476, "step": 2090 }, { "epoch": 1.9487418452935694, "grad_norm": 0.14560423496002442, "learning_rate": 7.787366240938903e-06, "loss": 0.4411, "step": 2091 }, { "epoch": 1.9496738117427772, "grad_norm": 0.14295447744959996, "learning_rate": 7.780462547462893e-06, "loss": 0.4361, "step": 2092 }, { "epoch": 1.950605778191985, "grad_norm": 0.14616791782266897, "learning_rate": 7.773558853986884e-06, "loss": 0.4547, "step": 2093 }, { "epoch": 1.9515377446411928, "grad_norm": 0.13893891382896306, "learning_rate": 7.766655160510875e-06, "loss": 0.4189, "step": 2094 }, { "epoch": 1.9524697110904008, "grad_norm": 0.14585956685068885, "learning_rate": 7.759751467034864e-06, "loss": 0.4636, "step": 2095 }, { "epoch": 1.9534016775396086, "grad_norm": 0.2803443488749313, "learning_rate": 7.752847773558855e-06, "loss": 0.4563, "step": 2096 }, { "epoch": 1.9543336439888164, "grad_norm": 0.15073310455843555, "learning_rate": 7.745944080082844e-06, "loss": 0.4946, "step": 2097 }, { "epoch": 1.9552656104380244, "grad_norm": 0.14480242667244103, "learning_rate": 7.739040386606836e-06, "loss": 0.4614, "step": 2098 }, { "epoch": 1.9561975768872322, "grad_norm": 0.1345485944368097, "learning_rate": 7.732136693130827e-06, "loss": 0.4622, "step": 2099 }, { "epoch": 1.95712954333644, "grad_norm": 0.16148625336267644, "learning_rate": 7.725232999654816e-06, "loss": 0.461, "step": 2100 }, { "epoch": 1.9580615097856477, "grad_norm": 0.15723242048915143, "learning_rate": 7.718329306178807e-06, "loss": 0.4454, "step": 2101 }, { "epoch": 1.9589934762348555, "grad_norm": 0.13914895442687497, "learning_rate": 7.711425612702796e-06, "loss": 0.4389, "step": 2102 }, { "epoch": 1.9599254426840633, "grad_norm": 0.142615577715268, "learning_rate": 7.704521919226788e-06, "loss": 0.4462, "step": 2103 }, { "epoch": 1.9608574091332711, "grad_norm": 0.1590557164047874, "learning_rate": 7.697618225750777e-06, "loss": 0.5032, "step": 2104 }, { "epoch": 1.961789375582479, "grad_norm": 0.15240704712192402, "learning_rate": 7.690714532274768e-06, "loss": 0.4619, "step": 2105 }, { "epoch": 1.962721342031687, "grad_norm": 0.15841890030970432, "learning_rate": 7.683810838798757e-06, "loss": 0.4541, "step": 2106 }, { "epoch": 1.9636533084808947, "grad_norm": 0.14255412456502117, "learning_rate": 7.676907145322748e-06, "loss": 0.4399, "step": 2107 }, { "epoch": 1.9645852749301025, "grad_norm": 0.15775007528540688, "learning_rate": 7.670003451846738e-06, "loss": 0.5076, "step": 2108 }, { "epoch": 1.9655172413793105, "grad_norm": 0.1457296782403031, "learning_rate": 7.663099758370729e-06, "loss": 0.4563, "step": 2109 }, { "epoch": 1.9664492078285183, "grad_norm": 0.1587779491953886, "learning_rate": 7.65619606489472e-06, "loss": 0.4719, "step": 2110 }, { "epoch": 1.967381174277726, "grad_norm": 0.1658444443523515, "learning_rate": 7.649292371418709e-06, "loss": 0.488, "step": 2111 }, { "epoch": 1.9683131407269339, "grad_norm": 0.14764807669585123, "learning_rate": 7.6423886779427e-06, "loss": 0.4705, "step": 2112 }, { "epoch": 1.9692451071761417, "grad_norm": 0.1399439535110299, "learning_rate": 7.63548498446669e-06, "loss": 0.4529, "step": 2113 }, { "epoch": 1.9701770736253494, "grad_norm": 0.15034016373627038, "learning_rate": 7.628581290990681e-06, "loss": 0.4656, "step": 2114 }, { "epoch": 1.9711090400745572, "grad_norm": 0.1499258747631414, "learning_rate": 7.621677597514671e-06, "loss": 0.4696, "step": 2115 }, { "epoch": 1.972041006523765, "grad_norm": 0.14101049450161546, "learning_rate": 7.614773904038662e-06, "loss": 0.4648, "step": 2116 }, { "epoch": 1.972972972972973, "grad_norm": 0.1388522060892942, "learning_rate": 7.6078702105626516e-06, "loss": 0.4594, "step": 2117 }, { "epoch": 1.9739049394221808, "grad_norm": 0.1610881601483381, "learning_rate": 7.600966517086642e-06, "loss": 0.4768, "step": 2118 }, { "epoch": 1.9748369058713886, "grad_norm": 0.14860764321403763, "learning_rate": 7.594062823610633e-06, "loss": 0.4505, "step": 2119 }, { "epoch": 1.9757688723205966, "grad_norm": 0.13792730295106612, "learning_rate": 7.587159130134623e-06, "loss": 0.4236, "step": 2120 }, { "epoch": 1.9767008387698044, "grad_norm": 0.14967852778236534, "learning_rate": 7.580255436658614e-06, "loss": 0.465, "step": 2121 }, { "epoch": 1.9776328052190122, "grad_norm": 0.15847858493595351, "learning_rate": 7.5733517431826035e-06, "loss": 0.4622, "step": 2122 }, { "epoch": 1.97856477166822, "grad_norm": 0.14482343358662936, "learning_rate": 7.566448049706594e-06, "loss": 0.4638, "step": 2123 }, { "epoch": 1.9794967381174278, "grad_norm": 0.14541449291502914, "learning_rate": 7.559544356230583e-06, "loss": 0.458, "step": 2124 }, { "epoch": 1.9804287045666356, "grad_norm": 0.13985636949374358, "learning_rate": 7.552640662754575e-06, "loss": 0.4259, "step": 2125 }, { "epoch": 1.9813606710158433, "grad_norm": 0.1514448814935872, "learning_rate": 7.545736969278564e-06, "loss": 0.4357, "step": 2126 }, { "epoch": 1.9822926374650511, "grad_norm": 0.1442984754150896, "learning_rate": 7.5388332758025555e-06, "loss": 0.4608, "step": 2127 }, { "epoch": 1.983224603914259, "grad_norm": 0.1422280059409808, "learning_rate": 7.5319295823265445e-06, "loss": 0.4774, "step": 2128 }, { "epoch": 1.984156570363467, "grad_norm": 0.1441491044271299, "learning_rate": 7.525025888850535e-06, "loss": 0.4746, "step": 2129 }, { "epoch": 1.9850885368126747, "grad_norm": 0.1613372439628054, "learning_rate": 7.518122195374527e-06, "loss": 0.4614, "step": 2130 }, { "epoch": 1.9860205032618827, "grad_norm": 0.1455792918107001, "learning_rate": 7.511218501898516e-06, "loss": 0.4445, "step": 2131 }, { "epoch": 1.9869524697110905, "grad_norm": 0.15306087748990183, "learning_rate": 7.504314808422507e-06, "loss": 0.4472, "step": 2132 }, { "epoch": 1.9878844361602983, "grad_norm": 0.13612675722648276, "learning_rate": 7.4974111149464965e-06, "loss": 0.4242, "step": 2133 }, { "epoch": 1.988816402609506, "grad_norm": 0.14323056798381162, "learning_rate": 7.490507421470487e-06, "loss": 0.4413, "step": 2134 }, { "epoch": 1.9897483690587139, "grad_norm": 0.15696489525903098, "learning_rate": 7.483603727994477e-06, "loss": 0.4593, "step": 2135 }, { "epoch": 1.9906803355079217, "grad_norm": 0.15478257090256753, "learning_rate": 7.476700034518468e-06, "loss": 0.434, "step": 2136 }, { "epoch": 1.9916123019571295, "grad_norm": 0.14600755720133562, "learning_rate": 7.469796341042458e-06, "loss": 0.4696, "step": 2137 }, { "epoch": 1.9925442684063372, "grad_norm": 0.14285927209260899, "learning_rate": 7.4628926475664485e-06, "loss": 0.4481, "step": 2138 }, { "epoch": 1.993476234855545, "grad_norm": 0.15187551365565713, "learning_rate": 7.455988954090439e-06, "loss": 0.4571, "step": 2139 }, { "epoch": 1.994408201304753, "grad_norm": 0.15403813330158322, "learning_rate": 7.449085260614429e-06, "loss": 0.4771, "step": 2140 }, { "epoch": 1.9953401677539608, "grad_norm": 0.13869249150248592, "learning_rate": 7.44218156713842e-06, "loss": 0.443, "step": 2141 }, { "epoch": 1.9962721342031688, "grad_norm": 0.14496582961769688, "learning_rate": 7.43527787366241e-06, "loss": 0.4531, "step": 2142 }, { "epoch": 1.9972041006523766, "grad_norm": 0.13182223253418546, "learning_rate": 7.4283741801864004e-06, "loss": 0.436, "step": 2143 }, { "epoch": 1.9981360671015844, "grad_norm": 0.14872329568378329, "learning_rate": 7.42147048671039e-06, "loss": 0.4919, "step": 2144 }, { "epoch": 1.9990680335507922, "grad_norm": 0.16196461778018756, "learning_rate": 7.414566793234381e-06, "loss": 0.4803, "step": 2145 }, { "epoch": 2.0, "grad_norm": 0.13847180438773227, "learning_rate": 7.407663099758371e-06, "loss": 0.4275, "step": 2146 }, { "epoch": 2.000931966449208, "grad_norm": 0.14205399169563498, "learning_rate": 7.400759406282362e-06, "loss": 0.4382, "step": 2147 }, { "epoch": 2.0018639328984156, "grad_norm": 0.1381862817718021, "learning_rate": 7.3938557128063516e-06, "loss": 0.4242, "step": 2148 }, { "epoch": 2.0027958993476234, "grad_norm": 0.14114992712662475, "learning_rate": 7.386952019330342e-06, "loss": 0.4276, "step": 2149 }, { "epoch": 2.003727865796831, "grad_norm": 0.14176040102316778, "learning_rate": 7.380048325854333e-06, "loss": 0.4255, "step": 2150 }, { "epoch": 2.004659832246039, "grad_norm": 0.14511128531565837, "learning_rate": 7.373144632378323e-06, "loss": 0.4638, "step": 2151 }, { "epoch": 2.005591798695247, "grad_norm": 0.15933344177197767, "learning_rate": 7.366240938902314e-06, "loss": 0.4352, "step": 2152 }, { "epoch": 2.006523765144455, "grad_norm": 0.15443337924863218, "learning_rate": 7.3593372454263035e-06, "loss": 0.4738, "step": 2153 }, { "epoch": 2.0074557315936628, "grad_norm": 0.15332530669714126, "learning_rate": 7.352433551950294e-06, "loss": 0.4859, "step": 2154 }, { "epoch": 2.0083876980428705, "grad_norm": 0.15558989550261854, "learning_rate": 7.345529858474284e-06, "loss": 0.4336, "step": 2155 }, { "epoch": 2.0093196644920783, "grad_norm": 0.16253883062202878, "learning_rate": 7.338626164998275e-06, "loss": 0.4651, "step": 2156 }, { "epoch": 2.010251630941286, "grad_norm": 0.15566561062835088, "learning_rate": 7.331722471522265e-06, "loss": 0.4589, "step": 2157 }, { "epoch": 2.011183597390494, "grad_norm": 0.1504920481862383, "learning_rate": 7.3248187780462555e-06, "loss": 0.4292, "step": 2158 }, { "epoch": 2.0121155638397017, "grad_norm": 0.14074018700581, "learning_rate": 7.317915084570245e-06, "loss": 0.4377, "step": 2159 }, { "epoch": 2.0130475302889095, "grad_norm": 0.15165960295928868, "learning_rate": 7.311011391094236e-06, "loss": 0.4476, "step": 2160 }, { "epoch": 2.0139794967381173, "grad_norm": 0.15975296564114666, "learning_rate": 7.304107697618227e-06, "loss": 0.4707, "step": 2161 }, { "epoch": 2.014911463187325, "grad_norm": 0.15541415965136401, "learning_rate": 7.297204004142217e-06, "loss": 0.4223, "step": 2162 }, { "epoch": 2.0158434296365333, "grad_norm": 0.1538982667564575, "learning_rate": 7.2903003106662075e-06, "loss": 0.4424, "step": 2163 }, { "epoch": 2.016775396085741, "grad_norm": 0.14577678530880508, "learning_rate": 7.283396617190197e-06, "loss": 0.4339, "step": 2164 }, { "epoch": 2.017707362534949, "grad_norm": 0.14799653568116716, "learning_rate": 7.276492923714188e-06, "loss": 0.4621, "step": 2165 }, { "epoch": 2.0186393289841567, "grad_norm": 0.14921089057319156, "learning_rate": 7.269589230238178e-06, "loss": 0.4318, "step": 2166 }, { "epoch": 2.0195712954333644, "grad_norm": 0.1516144828004112, "learning_rate": 7.262685536762169e-06, "loss": 0.4613, "step": 2167 }, { "epoch": 2.0205032618825722, "grad_norm": 0.15538181701825818, "learning_rate": 7.255781843286159e-06, "loss": 0.4701, "step": 2168 }, { "epoch": 2.02143522833178, "grad_norm": 0.13964432715959965, "learning_rate": 7.248878149810149e-06, "loss": 0.4439, "step": 2169 }, { "epoch": 2.022367194780988, "grad_norm": 0.14611071876792064, "learning_rate": 7.24197445633414e-06, "loss": 0.4365, "step": 2170 }, { "epoch": 2.0232991612301956, "grad_norm": 0.1421631667425636, "learning_rate": 7.23507076285813e-06, "loss": 0.4401, "step": 2171 }, { "epoch": 2.0242311276794034, "grad_norm": 0.150853770107673, "learning_rate": 7.228167069382121e-06, "loss": 0.4495, "step": 2172 }, { "epoch": 2.025163094128611, "grad_norm": 0.15070576146932002, "learning_rate": 7.2212633759061106e-06, "loss": 0.454, "step": 2173 }, { "epoch": 2.0260950605778194, "grad_norm": 0.15060190739327134, "learning_rate": 7.214359682430101e-06, "loss": 0.4339, "step": 2174 }, { "epoch": 2.027027027027027, "grad_norm": 0.1491003942675449, "learning_rate": 7.20745598895409e-06, "loss": 0.4211, "step": 2175 }, { "epoch": 2.027958993476235, "grad_norm": 0.14754377654732972, "learning_rate": 7.200552295478082e-06, "loss": 0.4303, "step": 2176 }, { "epoch": 2.0288909599254428, "grad_norm": 0.14517084785974962, "learning_rate": 7.193648602002071e-06, "loss": 0.4559, "step": 2177 }, { "epoch": 2.0298229263746506, "grad_norm": 0.14091991497611733, "learning_rate": 7.1867449085260625e-06, "loss": 0.4555, "step": 2178 }, { "epoch": 2.0307548928238583, "grad_norm": 0.1420584245173149, "learning_rate": 7.1798412150500516e-06, "loss": 0.4616, "step": 2179 }, { "epoch": 2.031686859273066, "grad_norm": 0.15225745960371284, "learning_rate": 7.172937521574042e-06, "loss": 0.4546, "step": 2180 }, { "epoch": 2.032618825722274, "grad_norm": 0.14728126696052887, "learning_rate": 7.166033828098034e-06, "loss": 0.463, "step": 2181 }, { "epoch": 2.0335507921714817, "grad_norm": 0.13714803693001895, "learning_rate": 7.159130134622023e-06, "loss": 0.4568, "step": 2182 }, { "epoch": 2.0344827586206895, "grad_norm": 0.14512372731392364, "learning_rate": 7.152226441146014e-06, "loss": 0.4524, "step": 2183 }, { "epoch": 2.0354147250698973, "grad_norm": 0.14231167447632354, "learning_rate": 7.1453227476700035e-06, "loss": 0.4321, "step": 2184 }, { "epoch": 2.0363466915191055, "grad_norm": 0.14034957062370607, "learning_rate": 7.138419054193994e-06, "loss": 0.409, "step": 2185 }, { "epoch": 2.0372786579683133, "grad_norm": 0.1409223397349251, "learning_rate": 7.131515360717984e-06, "loss": 0.4348, "step": 2186 }, { "epoch": 2.038210624417521, "grad_norm": 0.14631634943378075, "learning_rate": 7.124611667241975e-06, "loss": 0.4644, "step": 2187 }, { "epoch": 2.039142590866729, "grad_norm": 0.14066149416393803, "learning_rate": 7.117707973765965e-06, "loss": 0.448, "step": 2188 }, { "epoch": 2.0400745573159367, "grad_norm": 0.13711980732725634, "learning_rate": 7.1108042802899555e-06, "loss": 0.4299, "step": 2189 }, { "epoch": 2.0410065237651445, "grad_norm": 0.14899154619972788, "learning_rate": 7.103900586813946e-06, "loss": 0.4495, "step": 2190 }, { "epoch": 2.0419384902143523, "grad_norm": 0.14804383323027973, "learning_rate": 7.096996893337936e-06, "loss": 0.4793, "step": 2191 }, { "epoch": 2.04287045666356, "grad_norm": 0.14924168779139665, "learning_rate": 7.090093199861927e-06, "loss": 0.4553, "step": 2192 }, { "epoch": 2.043802423112768, "grad_norm": 0.14744285312145838, "learning_rate": 7.083189506385917e-06, "loss": 0.4362, "step": 2193 }, { "epoch": 2.0447343895619756, "grad_norm": 0.14186259590418898, "learning_rate": 7.0762858129099075e-06, "loss": 0.4497, "step": 2194 }, { "epoch": 2.0456663560111834, "grad_norm": 0.15260926493678617, "learning_rate": 7.069382119433897e-06, "loss": 0.4941, "step": 2195 }, { "epoch": 2.0465983224603916, "grad_norm": 0.14601444953147044, "learning_rate": 7.062478425957888e-06, "loss": 0.4695, "step": 2196 }, { "epoch": 2.0475302889095994, "grad_norm": 0.14011860842693796, "learning_rate": 7.055574732481878e-06, "loss": 0.43, "step": 2197 }, { "epoch": 2.048462255358807, "grad_norm": 0.1481081086365031, "learning_rate": 7.048671039005869e-06, "loss": 0.4725, "step": 2198 }, { "epoch": 2.049394221808015, "grad_norm": 0.1497035323640936, "learning_rate": 7.041767345529859e-06, "loss": 0.4801, "step": 2199 }, { "epoch": 2.050326188257223, "grad_norm": 0.14093623867063754, "learning_rate": 7.034863652053849e-06, "loss": 0.4404, "step": 2200 }, { "epoch": 2.0512581547064306, "grad_norm": 0.1349864747791009, "learning_rate": 7.02795995857784e-06, "loss": 0.4399, "step": 2201 }, { "epoch": 2.0521901211556384, "grad_norm": 0.1472126938261296, "learning_rate": 7.02105626510183e-06, "loss": 0.4629, "step": 2202 }, { "epoch": 2.053122087604846, "grad_norm": 0.14313912024199113, "learning_rate": 7.014152571625821e-06, "loss": 0.4438, "step": 2203 }, { "epoch": 2.054054054054054, "grad_norm": 0.15217116891270124, "learning_rate": 7.0072488781498106e-06, "loss": 0.4246, "step": 2204 }, { "epoch": 2.0549860205032617, "grad_norm": 0.14099987283379686, "learning_rate": 7.000345184673801e-06, "loss": 0.4448, "step": 2205 }, { "epoch": 2.0559179869524695, "grad_norm": 0.15275981875288897, "learning_rate": 6.993441491197791e-06, "loss": 0.4643, "step": 2206 }, { "epoch": 2.0568499534016778, "grad_norm": 0.14225131476717118, "learning_rate": 6.986537797721782e-06, "loss": 0.4476, "step": 2207 }, { "epoch": 2.0577819198508855, "grad_norm": 0.14099922642654006, "learning_rate": 6.979634104245772e-06, "loss": 0.4352, "step": 2208 }, { "epoch": 2.0587138863000933, "grad_norm": 0.15157096038270448, "learning_rate": 6.9727304107697625e-06, "loss": 0.45, "step": 2209 }, { "epoch": 2.059645852749301, "grad_norm": 0.1499029948346143, "learning_rate": 6.965826717293753e-06, "loss": 0.4298, "step": 2210 }, { "epoch": 2.060577819198509, "grad_norm": 0.1400792388942806, "learning_rate": 6.958923023817743e-06, "loss": 0.4368, "step": 2211 }, { "epoch": 2.0615097856477167, "grad_norm": 0.14284617925414492, "learning_rate": 6.952019330341734e-06, "loss": 0.4293, "step": 2212 }, { "epoch": 2.0624417520969245, "grad_norm": 0.15996621870923486, "learning_rate": 6.945115636865724e-06, "loss": 0.4636, "step": 2213 }, { "epoch": 2.0633737185461323, "grad_norm": 0.13934137702738292, "learning_rate": 6.9382119433897145e-06, "loss": 0.4347, "step": 2214 }, { "epoch": 2.06430568499534, "grad_norm": 0.14954860815022655, "learning_rate": 6.931308249913704e-06, "loss": 0.4684, "step": 2215 }, { "epoch": 2.065237651444548, "grad_norm": 0.13098817550218075, "learning_rate": 6.924404556437695e-06, "loss": 0.4098, "step": 2216 }, { "epoch": 2.0661696178937556, "grad_norm": 0.14497558886278505, "learning_rate": 6.917500862961685e-06, "loss": 0.4647, "step": 2217 }, { "epoch": 2.0671015843429634, "grad_norm": 0.13732294827255043, "learning_rate": 6.910597169485676e-06, "loss": 0.4142, "step": 2218 }, { "epoch": 2.0680335507921717, "grad_norm": 0.14395939847872272, "learning_rate": 6.903693476009666e-06, "loss": 0.4493, "step": 2219 }, { "epoch": 2.0689655172413794, "grad_norm": 0.13971950194469154, "learning_rate": 6.896789782533656e-06, "loss": 0.4403, "step": 2220 }, { "epoch": 2.0698974836905872, "grad_norm": 0.1435734269740254, "learning_rate": 6.889886089057647e-06, "loss": 0.4351, "step": 2221 }, { "epoch": 2.070829450139795, "grad_norm": 0.1361956399388574, "learning_rate": 6.882982395581637e-06, "loss": 0.4423, "step": 2222 }, { "epoch": 2.071761416589003, "grad_norm": 0.14112148297948823, "learning_rate": 6.876078702105628e-06, "loss": 0.4403, "step": 2223 }, { "epoch": 2.0726933830382106, "grad_norm": 0.14053084764626197, "learning_rate": 6.869175008629618e-06, "loss": 0.4232, "step": 2224 }, { "epoch": 2.0736253494874184, "grad_norm": 0.13488771722306026, "learning_rate": 6.862271315153608e-06, "loss": 0.4251, "step": 2225 }, { "epoch": 2.074557315936626, "grad_norm": 0.13082998193400494, "learning_rate": 6.855367621677597e-06, "loss": 0.4055, "step": 2226 }, { "epoch": 2.075489282385834, "grad_norm": 0.1460030618194874, "learning_rate": 6.848463928201589e-06, "loss": 0.4643, "step": 2227 }, { "epoch": 2.0764212488350418, "grad_norm": 0.14799000997245892, "learning_rate": 6.841560234725578e-06, "loss": 0.4396, "step": 2228 }, { "epoch": 2.0773532152842495, "grad_norm": 0.1465210205978065, "learning_rate": 6.834656541249569e-06, "loss": 0.4399, "step": 2229 }, { "epoch": 2.0782851817334578, "grad_norm": 0.1393087045100752, "learning_rate": 6.82775284777356e-06, "loss": 0.4319, "step": 2230 }, { "epoch": 2.0792171481826656, "grad_norm": 0.14411565762270775, "learning_rate": 6.820849154297549e-06, "loss": 0.4858, "step": 2231 }, { "epoch": 2.0801491146318734, "grad_norm": 0.13940807286837914, "learning_rate": 6.813945460821541e-06, "loss": 0.4238, "step": 2232 }, { "epoch": 2.081081081081081, "grad_norm": 0.13977478066033205, "learning_rate": 6.80704176734553e-06, "loss": 0.4287, "step": 2233 }, { "epoch": 2.082013047530289, "grad_norm": 0.1453636748820104, "learning_rate": 6.800138073869521e-06, "loss": 0.4337, "step": 2234 }, { "epoch": 2.0829450139794967, "grad_norm": 0.1335615772911499, "learning_rate": 6.7932343803935106e-06, "loss": 0.4217, "step": 2235 }, { "epoch": 2.0838769804287045, "grad_norm": 0.1472570955769758, "learning_rate": 6.786330686917501e-06, "loss": 0.4489, "step": 2236 }, { "epoch": 2.0848089468779123, "grad_norm": 0.15186688493584616, "learning_rate": 6.779426993441491e-06, "loss": 0.4776, "step": 2237 }, { "epoch": 2.08574091332712, "grad_norm": 0.15160153360965498, "learning_rate": 6.772523299965482e-06, "loss": 0.4731, "step": 2238 }, { "epoch": 2.086672879776328, "grad_norm": 0.15882826675610515, "learning_rate": 6.765619606489472e-06, "loss": 0.4456, "step": 2239 }, { "epoch": 2.0876048462255357, "grad_norm": 0.1361696880855179, "learning_rate": 6.7587159130134625e-06, "loss": 0.4543, "step": 2240 }, { "epoch": 2.088536812674744, "grad_norm": 0.14205874477524225, "learning_rate": 6.751812219537453e-06, "loss": 0.4421, "step": 2241 }, { "epoch": 2.0894687791239517, "grad_norm": 0.14395054612613092, "learning_rate": 6.744908526061443e-06, "loss": 0.4529, "step": 2242 }, { "epoch": 2.0904007455731595, "grad_norm": 0.14192160541224855, "learning_rate": 6.738004832585434e-06, "loss": 0.4401, "step": 2243 }, { "epoch": 2.0913327120223673, "grad_norm": 0.14112522480494602, "learning_rate": 6.731101139109424e-06, "loss": 0.4406, "step": 2244 }, { "epoch": 2.092264678471575, "grad_norm": 0.15428730584296596, "learning_rate": 6.7241974456334145e-06, "loss": 0.4509, "step": 2245 }, { "epoch": 2.093196644920783, "grad_norm": 0.15158416786557907, "learning_rate": 6.717293752157404e-06, "loss": 0.4494, "step": 2246 }, { "epoch": 2.0941286113699906, "grad_norm": 0.14609988782891678, "learning_rate": 6.710390058681395e-06, "loss": 0.4772, "step": 2247 }, { "epoch": 2.0950605778191984, "grad_norm": 0.14323567115831876, "learning_rate": 6.703486365205385e-06, "loss": 0.4309, "step": 2248 }, { "epoch": 2.095992544268406, "grad_norm": 0.13394889484672215, "learning_rate": 6.696582671729376e-06, "loss": 0.4491, "step": 2249 }, { "epoch": 2.096924510717614, "grad_norm": 0.14919143733364817, "learning_rate": 6.6896789782533665e-06, "loss": 0.4543, "step": 2250 }, { "epoch": 2.0978564771668218, "grad_norm": 0.1565401609445181, "learning_rate": 6.682775284777356e-06, "loss": 0.4514, "step": 2251 }, { "epoch": 2.09878844361603, "grad_norm": 0.15001956672670105, "learning_rate": 6.675871591301347e-06, "loss": 0.4468, "step": 2252 }, { "epoch": 2.099720410065238, "grad_norm": 0.18807878132762706, "learning_rate": 6.668967897825337e-06, "loss": 0.4511, "step": 2253 }, { "epoch": 2.1006523765144456, "grad_norm": 0.14943326278590413, "learning_rate": 6.662064204349328e-06, "loss": 0.4546, "step": 2254 }, { "epoch": 2.1015843429636534, "grad_norm": 0.14141621003832675, "learning_rate": 6.655160510873318e-06, "loss": 0.4759, "step": 2255 }, { "epoch": 2.102516309412861, "grad_norm": 0.1465930640631897, "learning_rate": 6.648256817397308e-06, "loss": 0.4726, "step": 2256 }, { "epoch": 2.103448275862069, "grad_norm": 0.14413868931831952, "learning_rate": 6.641353123921298e-06, "loss": 0.4325, "step": 2257 }, { "epoch": 2.1043802423112767, "grad_norm": 0.13993395343054604, "learning_rate": 6.634449430445289e-06, "loss": 0.4645, "step": 2258 }, { "epoch": 2.1053122087604845, "grad_norm": 0.14672248766986742, "learning_rate": 6.627545736969279e-06, "loss": 0.4537, "step": 2259 }, { "epoch": 2.1062441752096923, "grad_norm": 0.14101826860898992, "learning_rate": 6.6206420434932696e-06, "loss": 0.4276, "step": 2260 }, { "epoch": 2.1071761416589, "grad_norm": 0.14181741778202187, "learning_rate": 6.61373835001726e-06, "loss": 0.458, "step": 2261 }, { "epoch": 2.108108108108108, "grad_norm": 0.14789415353893895, "learning_rate": 6.60683465654125e-06, "loss": 0.439, "step": 2262 }, { "epoch": 2.109040074557316, "grad_norm": 0.14633608442061805, "learning_rate": 6.599930963065241e-06, "loss": 0.4562, "step": 2263 }, { "epoch": 2.109972041006524, "grad_norm": 0.14746048406273893, "learning_rate": 6.593027269589231e-06, "loss": 0.4309, "step": 2264 }, { "epoch": 2.1109040074557317, "grad_norm": 0.14868941831460655, "learning_rate": 6.5861235761132215e-06, "loss": 0.4438, "step": 2265 }, { "epoch": 2.1118359739049395, "grad_norm": 0.1412213894110898, "learning_rate": 6.579219882637211e-06, "loss": 0.4386, "step": 2266 }, { "epoch": 2.1127679403541473, "grad_norm": 0.14910724453917276, "learning_rate": 6.572316189161202e-06, "loss": 0.4517, "step": 2267 }, { "epoch": 2.113699906803355, "grad_norm": 0.14032477276738334, "learning_rate": 6.565412495685192e-06, "loss": 0.446, "step": 2268 }, { "epoch": 2.114631873252563, "grad_norm": 0.14108473580673958, "learning_rate": 6.558508802209183e-06, "loss": 0.4511, "step": 2269 }, { "epoch": 2.1155638397017706, "grad_norm": 0.15830442431448266, "learning_rate": 6.551605108733173e-06, "loss": 0.4933, "step": 2270 }, { "epoch": 2.1164958061509784, "grad_norm": 0.14745113257010484, "learning_rate": 6.544701415257163e-06, "loss": 0.4834, "step": 2271 }, { "epoch": 2.117427772600186, "grad_norm": 0.1487666957668755, "learning_rate": 6.537797721781154e-06, "loss": 0.4372, "step": 2272 }, { "epoch": 2.118359739049394, "grad_norm": 0.14013144483148177, "learning_rate": 6.530894028305144e-06, "loss": 0.4404, "step": 2273 }, { "epoch": 2.1192917054986022, "grad_norm": 0.15703736434246834, "learning_rate": 6.523990334829135e-06, "loss": 0.4587, "step": 2274 }, { "epoch": 2.12022367194781, "grad_norm": 0.14469905435917363, "learning_rate": 6.517086641353124e-06, "loss": 0.4168, "step": 2275 }, { "epoch": 2.121155638397018, "grad_norm": 0.15560937746018635, "learning_rate": 6.510182947877115e-06, "loss": 0.4423, "step": 2276 }, { "epoch": 2.1220876048462256, "grad_norm": 0.14368098962929646, "learning_rate": 6.503279254401104e-06, "loss": 0.4393, "step": 2277 }, { "epoch": 2.1230195712954334, "grad_norm": 0.14463576367792486, "learning_rate": 6.496375560925096e-06, "loss": 0.4691, "step": 2278 }, { "epoch": 2.123951537744641, "grad_norm": 0.13913003779823455, "learning_rate": 6.489471867449085e-06, "loss": 0.436, "step": 2279 }, { "epoch": 2.124883504193849, "grad_norm": 0.1605244287449641, "learning_rate": 6.482568173973076e-06, "loss": 0.4721, "step": 2280 }, { "epoch": 2.1258154706430568, "grad_norm": 0.1361253031957673, "learning_rate": 6.475664480497067e-06, "loss": 0.4121, "step": 2281 }, { "epoch": 2.1267474370922645, "grad_norm": 0.1447298762977716, "learning_rate": 6.468760787021056e-06, "loss": 0.4388, "step": 2282 }, { "epoch": 2.1276794035414723, "grad_norm": 0.14604099550299623, "learning_rate": 6.461857093545048e-06, "loss": 0.4477, "step": 2283 }, { "epoch": 2.12861136999068, "grad_norm": 0.1537247421925507, "learning_rate": 6.454953400069037e-06, "loss": 0.4414, "step": 2284 }, { "epoch": 2.1295433364398884, "grad_norm": 0.13930346777124147, "learning_rate": 6.448049706593028e-06, "loss": 0.4271, "step": 2285 }, { "epoch": 2.130475302889096, "grad_norm": 0.14595411837769348, "learning_rate": 6.441146013117018e-06, "loss": 0.4545, "step": 2286 }, { "epoch": 2.131407269338304, "grad_norm": 0.14953281152777675, "learning_rate": 6.434242319641008e-06, "loss": 0.4617, "step": 2287 }, { "epoch": 2.1323392357875117, "grad_norm": 0.15495327506483259, "learning_rate": 6.427338626164998e-06, "loss": 0.4352, "step": 2288 }, { "epoch": 2.1332712022367195, "grad_norm": 0.14667599624317298, "learning_rate": 6.420434932688989e-06, "loss": 0.4485, "step": 2289 }, { "epoch": 2.1342031686859273, "grad_norm": 0.1429329873197549, "learning_rate": 6.413531239212979e-06, "loss": 0.4334, "step": 2290 }, { "epoch": 2.135135135135135, "grad_norm": 0.13389816334702054, "learning_rate": 6.4066275457369696e-06, "loss": 0.4375, "step": 2291 }, { "epoch": 2.136067101584343, "grad_norm": 0.1524867935593966, "learning_rate": 6.39972385226096e-06, "loss": 0.4899, "step": 2292 }, { "epoch": 2.1369990680335507, "grad_norm": 0.14797611856380574, "learning_rate": 6.39282015878495e-06, "loss": 0.4011, "step": 2293 }, { "epoch": 2.1379310344827585, "grad_norm": 0.15350026645958348, "learning_rate": 6.385916465308941e-06, "loss": 0.4575, "step": 2294 }, { "epoch": 2.1388630009319662, "grad_norm": 0.14182606268349915, "learning_rate": 6.379012771832931e-06, "loss": 0.4528, "step": 2295 }, { "epoch": 2.1397949673811745, "grad_norm": 0.14749153864417686, "learning_rate": 6.3721090783569215e-06, "loss": 0.4599, "step": 2296 }, { "epoch": 2.1407269338303823, "grad_norm": 0.14407907902285438, "learning_rate": 6.365205384880911e-06, "loss": 0.4485, "step": 2297 }, { "epoch": 2.14165890027959, "grad_norm": 0.14579746642838592, "learning_rate": 6.358301691404902e-06, "loss": 0.4576, "step": 2298 }, { "epoch": 2.142590866728798, "grad_norm": 0.13830899833113666, "learning_rate": 6.351397997928892e-06, "loss": 0.4151, "step": 2299 }, { "epoch": 2.1435228331780056, "grad_norm": 0.14730206587166533, "learning_rate": 6.344494304452883e-06, "loss": 0.4622, "step": 2300 }, { "epoch": 2.1444547996272134, "grad_norm": 0.1406168631710061, "learning_rate": 6.3375906109768735e-06, "loss": 0.4371, "step": 2301 }, { "epoch": 2.145386766076421, "grad_norm": 0.15113587191727237, "learning_rate": 6.330686917500863e-06, "loss": 0.4735, "step": 2302 }, { "epoch": 2.146318732525629, "grad_norm": 0.13875888396134461, "learning_rate": 6.323783224024854e-06, "loss": 0.4392, "step": 2303 }, { "epoch": 2.147250698974837, "grad_norm": 0.13892450770693573, "learning_rate": 6.316879530548844e-06, "loss": 0.425, "step": 2304 }, { "epoch": 2.1481826654240446, "grad_norm": 0.1507600750657832, "learning_rate": 6.309975837072835e-06, "loss": 0.4402, "step": 2305 }, { "epoch": 2.1491146318732524, "grad_norm": 0.14689749833283106, "learning_rate": 6.303072143596825e-06, "loss": 0.4153, "step": 2306 }, { "epoch": 2.1500465983224606, "grad_norm": 0.1504946228028015, "learning_rate": 6.296168450120815e-06, "loss": 0.4786, "step": 2307 }, { "epoch": 2.1509785647716684, "grad_norm": 0.14246024727848533, "learning_rate": 6.289264756644805e-06, "loss": 0.4551, "step": 2308 }, { "epoch": 2.151910531220876, "grad_norm": 0.14174715576963096, "learning_rate": 6.282361063168796e-06, "loss": 0.4531, "step": 2309 }, { "epoch": 2.152842497670084, "grad_norm": 0.14829762323223034, "learning_rate": 6.275457369692786e-06, "loss": 0.4201, "step": 2310 }, { "epoch": 2.1537744641192917, "grad_norm": 0.16206749540396234, "learning_rate": 6.268553676216777e-06, "loss": 0.4691, "step": 2311 }, { "epoch": 2.1547064305684995, "grad_norm": 0.1642701407220531, "learning_rate": 6.261649982740767e-06, "loss": 0.4502, "step": 2312 }, { "epoch": 2.1556383970177073, "grad_norm": 0.14245476530695303, "learning_rate": 6.254746289264757e-06, "loss": 0.4709, "step": 2313 }, { "epoch": 2.156570363466915, "grad_norm": 0.1478793457420215, "learning_rate": 6.247842595788748e-06, "loss": 0.4327, "step": 2314 }, { "epoch": 2.157502329916123, "grad_norm": 0.16421190444674638, "learning_rate": 6.240938902312738e-06, "loss": 0.4554, "step": 2315 }, { "epoch": 2.1584342963653307, "grad_norm": 0.16043300145047507, "learning_rate": 6.2340352088367286e-06, "loss": 0.458, "step": 2316 }, { "epoch": 2.1593662628145385, "grad_norm": 0.13932852079768368, "learning_rate": 6.2271315153607184e-06, "loss": 0.4142, "step": 2317 }, { "epoch": 2.1602982292637467, "grad_norm": 0.1378752824922135, "learning_rate": 6.220227821884709e-06, "loss": 0.4386, "step": 2318 }, { "epoch": 2.1612301957129545, "grad_norm": 0.1642376557210597, "learning_rate": 6.213324128408699e-06, "loss": 0.4225, "step": 2319 }, { "epoch": 2.1621621621621623, "grad_norm": 0.15395428548237444, "learning_rate": 6.20642043493269e-06, "loss": 0.4304, "step": 2320 }, { "epoch": 2.16309412861137, "grad_norm": 0.14998526269871695, "learning_rate": 6.1995167414566805e-06, "loss": 0.4516, "step": 2321 }, { "epoch": 2.164026095060578, "grad_norm": 0.14879996441392682, "learning_rate": 6.19261304798067e-06, "loss": 0.4851, "step": 2322 }, { "epoch": 2.1649580615097856, "grad_norm": 0.14760728874951307, "learning_rate": 6.185709354504661e-06, "loss": 0.4322, "step": 2323 }, { "epoch": 2.1658900279589934, "grad_norm": 0.14149232641043968, "learning_rate": 6.178805661028651e-06, "loss": 0.4337, "step": 2324 }, { "epoch": 2.1668219944082012, "grad_norm": 0.15194504293428052, "learning_rate": 6.171901967552642e-06, "loss": 0.4799, "step": 2325 }, { "epoch": 2.167753960857409, "grad_norm": 0.16226975055318046, "learning_rate": 6.164998274076631e-06, "loss": 0.4618, "step": 2326 }, { "epoch": 2.168685927306617, "grad_norm": 0.14329155225979315, "learning_rate": 6.158094580600622e-06, "loss": 0.4423, "step": 2327 }, { "epoch": 2.1696178937558246, "grad_norm": 0.14940755678045622, "learning_rate": 6.151190887124611e-06, "loss": 0.4413, "step": 2328 }, { "epoch": 2.170549860205033, "grad_norm": 0.14374518463386712, "learning_rate": 6.144287193648603e-06, "loss": 0.4549, "step": 2329 }, { "epoch": 2.1714818266542406, "grad_norm": 0.14274109656386041, "learning_rate": 6.137383500172592e-06, "loss": 0.4445, "step": 2330 }, { "epoch": 2.1724137931034484, "grad_norm": 0.1456906797049672, "learning_rate": 6.130479806696583e-06, "loss": 0.4385, "step": 2331 }, { "epoch": 2.173345759552656, "grad_norm": 0.16031943386106629, "learning_rate": 6.123576113220574e-06, "loss": 0.4478, "step": 2332 }, { "epoch": 2.174277726001864, "grad_norm": 0.14142362343075465, "learning_rate": 6.116672419744563e-06, "loss": 0.4393, "step": 2333 }, { "epoch": 2.1752096924510718, "grad_norm": 0.14656262283624436, "learning_rate": 6.109768726268555e-06, "loss": 0.4436, "step": 2334 }, { "epoch": 2.1761416589002796, "grad_norm": 0.14535155529593108, "learning_rate": 6.102865032792544e-06, "loss": 0.4238, "step": 2335 }, { "epoch": 2.1770736253494873, "grad_norm": 0.1326819113508509, "learning_rate": 6.095961339316535e-06, "loss": 0.4175, "step": 2336 }, { "epoch": 2.178005591798695, "grad_norm": 0.1494617727919484, "learning_rate": 6.089057645840525e-06, "loss": 0.4562, "step": 2337 }, { "epoch": 2.178937558247903, "grad_norm": 0.14397007266001113, "learning_rate": 6.082153952364515e-06, "loss": 0.436, "step": 2338 }, { "epoch": 2.1798695246971107, "grad_norm": 0.14697023182880753, "learning_rate": 6.075250258888505e-06, "loss": 0.4675, "step": 2339 }, { "epoch": 2.180801491146319, "grad_norm": 0.14891500055362383, "learning_rate": 6.068346565412496e-06, "loss": 0.4132, "step": 2340 }, { "epoch": 2.1817334575955267, "grad_norm": 0.1468092417935623, "learning_rate": 6.061442871936487e-06, "loss": 0.45, "step": 2341 }, { "epoch": 2.1826654240447345, "grad_norm": 0.15547934987194795, "learning_rate": 6.054539178460477e-06, "loss": 0.4749, "step": 2342 }, { "epoch": 2.1835973904939423, "grad_norm": 0.14698798572483088, "learning_rate": 6.047635484984467e-06, "loss": 0.4513, "step": 2343 }, { "epoch": 2.18452935694315, "grad_norm": 0.14068186990955073, "learning_rate": 6.040731791508457e-06, "loss": 0.4603, "step": 2344 }, { "epoch": 2.185461323392358, "grad_norm": 0.15400524210591604, "learning_rate": 6.033828098032448e-06, "loss": 0.4622, "step": 2345 }, { "epoch": 2.1863932898415657, "grad_norm": 0.14550124291637195, "learning_rate": 6.026924404556438e-06, "loss": 0.4351, "step": 2346 }, { "epoch": 2.1873252562907735, "grad_norm": 0.1403281432445701, "learning_rate": 6.0200207110804286e-06, "loss": 0.4325, "step": 2347 }, { "epoch": 2.1882572227399812, "grad_norm": 0.1431542872743696, "learning_rate": 6.0131170176044184e-06, "loss": 0.4073, "step": 2348 }, { "epoch": 2.189189189189189, "grad_norm": 0.1343356796953578, "learning_rate": 6.006213324128409e-06, "loss": 0.4073, "step": 2349 }, { "epoch": 2.190121155638397, "grad_norm": 0.14352689036464028, "learning_rate": 5.999309630652399e-06, "loss": 0.4508, "step": 2350 }, { "epoch": 2.191053122087605, "grad_norm": 0.148655615652439, "learning_rate": 5.99240593717639e-06, "loss": 0.436, "step": 2351 }, { "epoch": 2.191985088536813, "grad_norm": 0.16553946983176893, "learning_rate": 5.9855022437003805e-06, "loss": 0.4686, "step": 2352 }, { "epoch": 2.1929170549860206, "grad_norm": 0.13580507049001847, "learning_rate": 5.97859855022437e-06, "loss": 0.4483, "step": 2353 }, { "epoch": 2.1938490214352284, "grad_norm": 0.14553516099768501, "learning_rate": 5.971694856748361e-06, "loss": 0.4745, "step": 2354 }, { "epoch": 2.194780987884436, "grad_norm": 0.14444521873108943, "learning_rate": 5.964791163272351e-06, "loss": 0.4554, "step": 2355 }, { "epoch": 2.195712954333644, "grad_norm": 0.1449899968960626, "learning_rate": 5.957887469796342e-06, "loss": 0.4795, "step": 2356 }, { "epoch": 2.196644920782852, "grad_norm": 0.1378111318300483, "learning_rate": 5.950983776320332e-06, "loss": 0.444, "step": 2357 }, { "epoch": 2.1975768872320596, "grad_norm": 0.1555764742834487, "learning_rate": 5.944080082844322e-06, "loss": 0.4384, "step": 2358 }, { "epoch": 2.1985088536812674, "grad_norm": 0.13838318608691136, "learning_rate": 5.937176389368312e-06, "loss": 0.438, "step": 2359 }, { "epoch": 2.199440820130475, "grad_norm": 0.15710947850659882, "learning_rate": 5.930272695892303e-06, "loss": 0.4544, "step": 2360 }, { "epoch": 2.200372786579683, "grad_norm": 0.14058556704497951, "learning_rate": 5.923369002416293e-06, "loss": 0.417, "step": 2361 }, { "epoch": 2.201304753028891, "grad_norm": 0.1394628650643327, "learning_rate": 5.916465308940284e-06, "loss": 0.4219, "step": 2362 }, { "epoch": 2.202236719478099, "grad_norm": 0.14166638983080748, "learning_rate": 5.909561615464274e-06, "loss": 0.4259, "step": 2363 }, { "epoch": 2.2031686859273067, "grad_norm": 0.14176377103824164, "learning_rate": 5.902657921988264e-06, "loss": 0.4316, "step": 2364 }, { "epoch": 2.2041006523765145, "grad_norm": 0.14912240685856115, "learning_rate": 5.895754228512255e-06, "loss": 0.4591, "step": 2365 }, { "epoch": 2.2050326188257223, "grad_norm": 0.14987321641389395, "learning_rate": 5.888850535036245e-06, "loss": 0.435, "step": 2366 }, { "epoch": 2.20596458527493, "grad_norm": 0.14191181597989153, "learning_rate": 5.881946841560236e-06, "loss": 0.4308, "step": 2367 }, { "epoch": 2.206896551724138, "grad_norm": 0.14098602665861645, "learning_rate": 5.8750431480842255e-06, "loss": 0.4176, "step": 2368 }, { "epoch": 2.2078285181733457, "grad_norm": 0.1386336705706933, "learning_rate": 5.868139454608216e-06, "loss": 0.4326, "step": 2369 }, { "epoch": 2.2087604846225535, "grad_norm": 0.1429010743283423, "learning_rate": 5.861235761132206e-06, "loss": 0.4444, "step": 2370 }, { "epoch": 2.2096924510717613, "grad_norm": 0.16005514488471784, "learning_rate": 5.854332067656197e-06, "loss": 0.4347, "step": 2371 }, { "epoch": 2.210624417520969, "grad_norm": 0.1451502898336121, "learning_rate": 5.8474283741801875e-06, "loss": 0.4401, "step": 2372 }, { "epoch": 2.2115563839701773, "grad_norm": 0.13309879028784932, "learning_rate": 5.8405246807041774e-06, "loss": 0.4197, "step": 2373 }, { "epoch": 2.212488350419385, "grad_norm": 0.13868891421253698, "learning_rate": 5.833620987228168e-06, "loss": 0.4407, "step": 2374 }, { "epoch": 2.213420316868593, "grad_norm": 0.14374263304753954, "learning_rate": 5.826717293752158e-06, "loss": 0.4523, "step": 2375 }, { "epoch": 2.2143522833178007, "grad_norm": 0.14663670573554397, "learning_rate": 5.819813600276149e-06, "loss": 0.4619, "step": 2376 }, { "epoch": 2.2152842497670084, "grad_norm": 0.14051158467017968, "learning_rate": 5.812909906800138e-06, "loss": 0.4417, "step": 2377 }, { "epoch": 2.2162162162162162, "grad_norm": 0.1386557307471188, "learning_rate": 5.806006213324129e-06, "loss": 0.4308, "step": 2378 }, { "epoch": 2.217148182665424, "grad_norm": 0.14512879721318256, "learning_rate": 5.7991025198481184e-06, "loss": 0.4707, "step": 2379 }, { "epoch": 2.218080149114632, "grad_norm": 0.15006611853617327, "learning_rate": 5.79219882637211e-06, "loss": 0.435, "step": 2380 }, { "epoch": 2.2190121155638396, "grad_norm": 0.15015790067714413, "learning_rate": 5.785295132896099e-06, "loss": 0.4654, "step": 2381 }, { "epoch": 2.2199440820130474, "grad_norm": 0.13855956972143343, "learning_rate": 5.77839143942009e-06, "loss": 0.4208, "step": 2382 }, { "epoch": 2.220876048462255, "grad_norm": 0.14560705532644805, "learning_rate": 5.771487745944081e-06, "loss": 0.4175, "step": 2383 }, { "epoch": 2.2218080149114634, "grad_norm": 0.1483760457383962, "learning_rate": 5.76458405246807e-06, "loss": 0.4361, "step": 2384 }, { "epoch": 2.222739981360671, "grad_norm": 0.151473503041097, "learning_rate": 5.757680358992062e-06, "loss": 0.4178, "step": 2385 }, { "epoch": 2.223671947809879, "grad_norm": 0.15256652073052918, "learning_rate": 5.750776665516051e-06, "loss": 0.4433, "step": 2386 }, { "epoch": 2.2246039142590868, "grad_norm": 0.14407924519191917, "learning_rate": 5.743872972040042e-06, "loss": 0.4609, "step": 2387 }, { "epoch": 2.2255358807082946, "grad_norm": 0.13778297832637557, "learning_rate": 5.736969278564032e-06, "loss": 0.4362, "step": 2388 }, { "epoch": 2.2264678471575023, "grad_norm": 0.13799353100555878, "learning_rate": 5.730065585088022e-06, "loss": 0.4384, "step": 2389 }, { "epoch": 2.22739981360671, "grad_norm": 0.1585516580315646, "learning_rate": 5.723161891612012e-06, "loss": 0.4385, "step": 2390 }, { "epoch": 2.228331780055918, "grad_norm": 0.1637858009144823, "learning_rate": 5.716258198136003e-06, "loss": 0.4626, "step": 2391 }, { "epoch": 2.2292637465051257, "grad_norm": 0.15976701076548838, "learning_rate": 5.709354504659994e-06, "loss": 0.4642, "step": 2392 }, { "epoch": 2.2301957129543335, "grad_norm": 0.1440156300593649, "learning_rate": 5.702450811183984e-06, "loss": 0.4732, "step": 2393 }, { "epoch": 2.2311276794035413, "grad_norm": 0.144814459067917, "learning_rate": 5.695547117707974e-06, "loss": 0.4457, "step": 2394 }, { "epoch": 2.2320596458527495, "grad_norm": 0.1356581341506248, "learning_rate": 5.688643424231964e-06, "loss": 0.4319, "step": 2395 }, { "epoch": 2.2329916123019573, "grad_norm": 0.138425031489006, "learning_rate": 5.681739730755955e-06, "loss": 0.4339, "step": 2396 }, { "epoch": 2.233923578751165, "grad_norm": 0.14443485545178206, "learning_rate": 5.674836037279945e-06, "loss": 0.4308, "step": 2397 }, { "epoch": 2.234855545200373, "grad_norm": 0.14435294119432662, "learning_rate": 5.667932343803936e-06, "loss": 0.4523, "step": 2398 }, { "epoch": 2.2357875116495807, "grad_norm": 0.13560873375526075, "learning_rate": 5.6610286503279255e-06, "loss": 0.4283, "step": 2399 }, { "epoch": 2.2367194780987885, "grad_norm": 0.14534364104016326, "learning_rate": 5.654124956851916e-06, "loss": 0.4464, "step": 2400 }, { "epoch": 2.2376514445479962, "grad_norm": 0.14120850640775753, "learning_rate": 5.647221263375906e-06, "loss": 0.4393, "step": 2401 }, { "epoch": 2.238583410997204, "grad_norm": 0.13496227522401547, "learning_rate": 5.640317569899897e-06, "loss": 0.4084, "step": 2402 }, { "epoch": 2.239515377446412, "grad_norm": 0.13807599004435767, "learning_rate": 5.6334138764238875e-06, "loss": 0.4133, "step": 2403 }, { "epoch": 2.2404473438956196, "grad_norm": 0.13327379476811504, "learning_rate": 5.6265101829478774e-06, "loss": 0.4304, "step": 2404 }, { "epoch": 2.2413793103448274, "grad_norm": 0.13616822425339617, "learning_rate": 5.619606489471868e-06, "loss": 0.4291, "step": 2405 }, { "epoch": 2.2423112767940356, "grad_norm": 0.14477070886782714, "learning_rate": 5.612702795995858e-06, "loss": 0.4585, "step": 2406 }, { "epoch": 2.2432432432432434, "grad_norm": 0.13410160896674192, "learning_rate": 5.605799102519849e-06, "loss": 0.4382, "step": 2407 }, { "epoch": 2.244175209692451, "grad_norm": 0.14912737841070708, "learning_rate": 5.598895409043839e-06, "loss": 0.4698, "step": 2408 }, { "epoch": 2.245107176141659, "grad_norm": 0.1428557827438842, "learning_rate": 5.591991715567829e-06, "loss": 0.4369, "step": 2409 }, { "epoch": 2.246039142590867, "grad_norm": 0.14453522643825822, "learning_rate": 5.585088022091819e-06, "loss": 0.458, "step": 2410 }, { "epoch": 2.2469711090400746, "grad_norm": 0.1348454529130041, "learning_rate": 5.57818432861581e-06, "loss": 0.4546, "step": 2411 }, { "epoch": 2.2479030754892824, "grad_norm": 0.13519364109848386, "learning_rate": 5.571280635139801e-06, "loss": 0.4207, "step": 2412 }, { "epoch": 2.24883504193849, "grad_norm": 0.13983867772021102, "learning_rate": 5.564376941663791e-06, "loss": 0.4553, "step": 2413 }, { "epoch": 2.249767008387698, "grad_norm": 0.14707138748694915, "learning_rate": 5.557473248187781e-06, "loss": 0.447, "step": 2414 }, { "epoch": 2.2506989748369057, "grad_norm": 0.1396793863120737, "learning_rate": 5.550569554711771e-06, "loss": 0.452, "step": 2415 }, { "epoch": 2.2516309412861135, "grad_norm": 0.14452557318936682, "learning_rate": 5.543665861235762e-06, "loss": 0.4445, "step": 2416 }, { "epoch": 2.2525629077353218, "grad_norm": 0.13253258065601084, "learning_rate": 5.536762167759752e-06, "loss": 0.4552, "step": 2417 }, { "epoch": 2.2534948741845295, "grad_norm": 0.1380097657380813, "learning_rate": 5.529858474283743e-06, "loss": 0.4663, "step": 2418 }, { "epoch": 2.2544268406337373, "grad_norm": 0.13426083664029584, "learning_rate": 5.5229547808077325e-06, "loss": 0.4267, "step": 2419 }, { "epoch": 2.255358807082945, "grad_norm": 0.14155006973991185, "learning_rate": 5.516051087331723e-06, "loss": 0.4351, "step": 2420 }, { "epoch": 2.256290773532153, "grad_norm": 0.14075986134317905, "learning_rate": 5.509147393855713e-06, "loss": 0.446, "step": 2421 }, { "epoch": 2.2572227399813607, "grad_norm": 0.14368515527883582, "learning_rate": 5.502243700379704e-06, "loss": 0.4451, "step": 2422 }, { "epoch": 2.2581547064305685, "grad_norm": 0.13772391557691688, "learning_rate": 5.4953400069036946e-06, "loss": 0.4387, "step": 2423 }, { "epoch": 2.2590866728797763, "grad_norm": 0.1387198203961118, "learning_rate": 5.4884363134276845e-06, "loss": 0.4431, "step": 2424 }, { "epoch": 2.260018639328984, "grad_norm": 0.1450813336471907, "learning_rate": 5.481532619951675e-06, "loss": 0.4537, "step": 2425 }, { "epoch": 2.260950605778192, "grad_norm": 0.14477871663946604, "learning_rate": 5.474628926475665e-06, "loss": 0.4409, "step": 2426 }, { "epoch": 2.2618825722273996, "grad_norm": 0.1346449352769592, "learning_rate": 5.467725232999656e-06, "loss": 0.4357, "step": 2427 }, { "epoch": 2.262814538676608, "grad_norm": 0.1440591361907592, "learning_rate": 5.460821539523645e-06, "loss": 0.4492, "step": 2428 }, { "epoch": 2.2637465051258157, "grad_norm": 0.14147726740461838, "learning_rate": 5.4539178460476364e-06, "loss": 0.4342, "step": 2429 }, { "epoch": 2.2646784715750234, "grad_norm": 0.1439051068682442, "learning_rate": 5.4470141525716255e-06, "loss": 0.4403, "step": 2430 }, { "epoch": 2.2656104380242312, "grad_norm": 0.13433276557349597, "learning_rate": 5.440110459095617e-06, "loss": 0.4011, "step": 2431 }, { "epoch": 2.266542404473439, "grad_norm": 0.14447300669933041, "learning_rate": 5.433206765619608e-06, "loss": 0.4713, "step": 2432 }, { "epoch": 2.267474370922647, "grad_norm": 0.13846269130788838, "learning_rate": 5.426303072143597e-06, "loss": 0.4445, "step": 2433 }, { "epoch": 2.2684063373718546, "grad_norm": 0.14775982963941223, "learning_rate": 5.419399378667588e-06, "loss": 0.4486, "step": 2434 }, { "epoch": 2.2693383038210624, "grad_norm": 0.15083512927195533, "learning_rate": 5.4124956851915774e-06, "loss": 0.4681, "step": 2435 }, { "epoch": 2.27027027027027, "grad_norm": 0.13725076696361355, "learning_rate": 5.405591991715568e-06, "loss": 0.4295, "step": 2436 }, { "epoch": 2.271202236719478, "grad_norm": 0.13666755136532482, "learning_rate": 5.398688298239558e-06, "loss": 0.4426, "step": 2437 }, { "epoch": 2.2721342031686858, "grad_norm": 0.14157939072273995, "learning_rate": 5.391784604763549e-06, "loss": 0.4555, "step": 2438 }, { "epoch": 2.273066169617894, "grad_norm": 0.14158023379174134, "learning_rate": 5.384880911287539e-06, "loss": 0.4117, "step": 2439 }, { "epoch": 2.2739981360671018, "grad_norm": 0.15251019652562434, "learning_rate": 5.377977217811529e-06, "loss": 0.4624, "step": 2440 }, { "epoch": 2.2749301025163096, "grad_norm": 0.133792890322324, "learning_rate": 5.371073524335519e-06, "loss": 0.4385, "step": 2441 }, { "epoch": 2.2758620689655173, "grad_norm": 0.1449793819549722, "learning_rate": 5.36416983085951e-06, "loss": 0.4533, "step": 2442 }, { "epoch": 2.276794035414725, "grad_norm": 0.13849204341335888, "learning_rate": 5.357266137383501e-06, "loss": 0.4406, "step": 2443 }, { "epoch": 2.277726001863933, "grad_norm": 0.1498791825395937, "learning_rate": 5.350362443907491e-06, "loss": 0.4675, "step": 2444 }, { "epoch": 2.2786579683131407, "grad_norm": 0.14782971943831708, "learning_rate": 5.343458750431481e-06, "loss": 0.4335, "step": 2445 }, { "epoch": 2.2795899347623485, "grad_norm": 0.14709824725914553, "learning_rate": 5.336555056955471e-06, "loss": 0.4414, "step": 2446 }, { "epoch": 2.2805219012115563, "grad_norm": 0.1429956095542674, "learning_rate": 5.329651363479462e-06, "loss": 0.4477, "step": 2447 }, { "epoch": 2.281453867660764, "grad_norm": 0.1500452803590275, "learning_rate": 5.322747670003452e-06, "loss": 0.4428, "step": 2448 }, { "epoch": 2.282385834109972, "grad_norm": 0.14126783035060697, "learning_rate": 5.315843976527443e-06, "loss": 0.4479, "step": 2449 }, { "epoch": 2.28331780055918, "grad_norm": 0.1514463425549868, "learning_rate": 5.3089402830514325e-06, "loss": 0.4533, "step": 2450 }, { "epoch": 2.284249767008388, "grad_norm": 0.14467165765850343, "learning_rate": 5.302036589575423e-06, "loss": 0.4605, "step": 2451 }, { "epoch": 2.2851817334575957, "grad_norm": 0.14540261713275993, "learning_rate": 5.295132896099414e-06, "loss": 0.4593, "step": 2452 }, { "epoch": 2.2861136999068035, "grad_norm": 0.13855475290746125, "learning_rate": 5.288229202623404e-06, "loss": 0.4315, "step": 2453 }, { "epoch": 2.2870456663560113, "grad_norm": 0.1494565089743376, "learning_rate": 5.281325509147395e-06, "loss": 0.4364, "step": 2454 }, { "epoch": 2.287977632805219, "grad_norm": 0.15475397171768734, "learning_rate": 5.2744218156713845e-06, "loss": 0.4518, "step": 2455 }, { "epoch": 2.288909599254427, "grad_norm": 0.1677281689149056, "learning_rate": 5.267518122195375e-06, "loss": 0.4604, "step": 2456 }, { "epoch": 2.2898415657036346, "grad_norm": 0.14392741254548513, "learning_rate": 5.260614428719365e-06, "loss": 0.437, "step": 2457 }, { "epoch": 2.2907735321528424, "grad_norm": 0.14599230279308506, "learning_rate": 5.253710735243356e-06, "loss": 0.4393, "step": 2458 }, { "epoch": 2.29170549860205, "grad_norm": 0.1422738159786065, "learning_rate": 5.246807041767346e-06, "loss": 0.4389, "step": 2459 }, { "epoch": 2.292637465051258, "grad_norm": 0.15307462296264368, "learning_rate": 5.2399033482913364e-06, "loss": 0.4541, "step": 2460 }, { "epoch": 2.293569431500466, "grad_norm": 0.16161064382828316, "learning_rate": 5.232999654815326e-06, "loss": 0.4488, "step": 2461 }, { "epoch": 2.294501397949674, "grad_norm": 0.14727465975956644, "learning_rate": 5.226095961339317e-06, "loss": 0.4378, "step": 2462 }, { "epoch": 2.295433364398882, "grad_norm": 0.14101649668826569, "learning_rate": 5.219192267863308e-06, "loss": 0.4356, "step": 2463 }, { "epoch": 2.2963653308480896, "grad_norm": 0.13878457108034026, "learning_rate": 5.212288574387298e-06, "loss": 0.4374, "step": 2464 }, { "epoch": 2.2972972972972974, "grad_norm": 0.1402962157845132, "learning_rate": 5.205384880911288e-06, "loss": 0.4369, "step": 2465 }, { "epoch": 2.298229263746505, "grad_norm": 0.14499260327394362, "learning_rate": 5.198481187435278e-06, "loss": 0.4445, "step": 2466 }, { "epoch": 2.299161230195713, "grad_norm": 0.1535126514292872, "learning_rate": 5.191577493959269e-06, "loss": 0.4382, "step": 2467 }, { "epoch": 2.3000931966449207, "grad_norm": 0.14037451098773326, "learning_rate": 5.184673800483259e-06, "loss": 0.4364, "step": 2468 }, { "epoch": 2.3010251630941285, "grad_norm": 0.13516556839817284, "learning_rate": 5.17777010700725e-06, "loss": 0.4102, "step": 2469 }, { "epoch": 2.3019571295433363, "grad_norm": 0.16563179012357604, "learning_rate": 5.1708664135312395e-06, "loss": 0.4749, "step": 2470 }, { "epoch": 2.302889095992544, "grad_norm": 0.1409535336576492, "learning_rate": 5.16396272005523e-06, "loss": 0.4662, "step": 2471 }, { "epoch": 2.3038210624417523, "grad_norm": 0.15394314654903593, "learning_rate": 5.15705902657922e-06, "loss": 0.4448, "step": 2472 }, { "epoch": 2.3047530288909597, "grad_norm": 0.162539064741474, "learning_rate": 5.150155333103211e-06, "loss": 0.4632, "step": 2473 }, { "epoch": 2.305684995340168, "grad_norm": 0.13409472215190157, "learning_rate": 5.143251639627202e-06, "loss": 0.4225, "step": 2474 }, { "epoch": 2.3066169617893757, "grad_norm": 0.1408656802755133, "learning_rate": 5.1363479461511915e-06, "loss": 0.443, "step": 2475 }, { "epoch": 2.3075489282385835, "grad_norm": 0.14015806781911216, "learning_rate": 5.129444252675182e-06, "loss": 0.4198, "step": 2476 }, { "epoch": 2.3084808946877913, "grad_norm": 0.14642156553037197, "learning_rate": 5.122540559199172e-06, "loss": 0.4443, "step": 2477 }, { "epoch": 2.309412861136999, "grad_norm": 0.14411617760749784, "learning_rate": 5.115636865723163e-06, "loss": 0.4478, "step": 2478 }, { "epoch": 2.310344827586207, "grad_norm": 0.13267860390937386, "learning_rate": 5.108733172247152e-06, "loss": 0.4167, "step": 2479 }, { "epoch": 2.3112767940354146, "grad_norm": 0.13906158759218998, "learning_rate": 5.1018294787711435e-06, "loss": 0.4447, "step": 2480 }, { "epoch": 2.3122087604846224, "grad_norm": 0.14515096218450008, "learning_rate": 5.0949257852951325e-06, "loss": 0.4379, "step": 2481 }, { "epoch": 2.31314072693383, "grad_norm": 0.1458332222899843, "learning_rate": 5.088022091819124e-06, "loss": 0.4784, "step": 2482 }, { "epoch": 2.3140726933830384, "grad_norm": 0.1402721734005814, "learning_rate": 5.081118398343115e-06, "loss": 0.4498, "step": 2483 }, { "epoch": 2.315004659832246, "grad_norm": 0.13999832432794482, "learning_rate": 5.074214704867104e-06, "loss": 0.4341, "step": 2484 }, { "epoch": 2.315936626281454, "grad_norm": 0.13749248354867974, "learning_rate": 5.0673110113910954e-06, "loss": 0.427, "step": 2485 }, { "epoch": 2.316868592730662, "grad_norm": 0.13535140834349618, "learning_rate": 5.0604073179150845e-06, "loss": 0.4241, "step": 2486 }, { "epoch": 2.3178005591798696, "grad_norm": 0.14253785152202847, "learning_rate": 5.053503624439075e-06, "loss": 0.4557, "step": 2487 }, { "epoch": 2.3187325256290774, "grad_norm": 0.13425206300195344, "learning_rate": 5.046599930963065e-06, "loss": 0.4347, "step": 2488 }, { "epoch": 2.319664492078285, "grad_norm": 0.14670753333353703, "learning_rate": 5.039696237487056e-06, "loss": 0.4513, "step": 2489 }, { "epoch": 2.320596458527493, "grad_norm": 0.1404819571137092, "learning_rate": 5.032792544011046e-06, "loss": 0.4189, "step": 2490 }, { "epoch": 2.3215284249767008, "grad_norm": 0.14542708228273318, "learning_rate": 5.0258888505350364e-06, "loss": 0.4624, "step": 2491 }, { "epoch": 2.3224603914259085, "grad_norm": 0.1457965070477338, "learning_rate": 5.018985157059026e-06, "loss": 0.4717, "step": 2492 }, { "epoch": 2.3233923578751163, "grad_norm": 0.1519283588991875, "learning_rate": 5.012081463583017e-06, "loss": 0.462, "step": 2493 }, { "epoch": 2.3243243243243246, "grad_norm": 0.1459960001718831, "learning_rate": 5.005177770107008e-06, "loss": 0.4438, "step": 2494 }, { "epoch": 2.325256290773532, "grad_norm": 0.13835702396341965, "learning_rate": 4.998274076630998e-06, "loss": 0.4644, "step": 2495 }, { "epoch": 2.32618825722274, "grad_norm": 0.22886170064860992, "learning_rate": 4.991370383154988e-06, "loss": 0.4452, "step": 2496 }, { "epoch": 2.327120223671948, "grad_norm": 0.13457375631898286, "learning_rate": 4.984466689678978e-06, "loss": 0.4289, "step": 2497 }, { "epoch": 2.3280521901211557, "grad_norm": 0.14494086026352218, "learning_rate": 4.977562996202969e-06, "loss": 0.466, "step": 2498 }, { "epoch": 2.3289841565703635, "grad_norm": 0.1382049701353014, "learning_rate": 4.97065930272696e-06, "loss": 0.4387, "step": 2499 }, { "epoch": 2.3299161230195713, "grad_norm": 0.1344562616896363, "learning_rate": 4.96375560925095e-06, "loss": 0.4295, "step": 2500 }, { "epoch": 2.330848089468779, "grad_norm": 0.1423423872409791, "learning_rate": 4.95685191577494e-06, "loss": 0.467, "step": 2501 }, { "epoch": 2.331780055917987, "grad_norm": 0.13676260410442045, "learning_rate": 4.94994822229893e-06, "loss": 0.4206, "step": 2502 }, { "epoch": 2.3327120223671947, "grad_norm": 0.13826139854706326, "learning_rate": 4.943044528822921e-06, "loss": 0.4423, "step": 2503 }, { "epoch": 2.3336439888164024, "grad_norm": 0.13814344125647973, "learning_rate": 4.936140835346911e-06, "loss": 0.4225, "step": 2504 }, { "epoch": 2.3345759552656107, "grad_norm": 0.1477632585822533, "learning_rate": 4.929237141870902e-06, "loss": 0.4779, "step": 2505 }, { "epoch": 2.335507921714818, "grad_norm": 0.1440980223803276, "learning_rate": 4.9223334483948915e-06, "loss": 0.441, "step": 2506 }, { "epoch": 2.3364398881640263, "grad_norm": 0.13957466700118382, "learning_rate": 4.915429754918881e-06, "loss": 0.451, "step": 2507 }, { "epoch": 2.337371854613234, "grad_norm": 0.14443976387618912, "learning_rate": 4.908526061442873e-06, "loss": 0.4617, "step": 2508 }, { "epoch": 2.338303821062442, "grad_norm": 0.14289239048572625, "learning_rate": 4.901622367966863e-06, "loss": 0.481, "step": 2509 }, { "epoch": 2.3392357875116496, "grad_norm": 0.1468157654229785, "learning_rate": 4.894718674490853e-06, "loss": 0.4653, "step": 2510 }, { "epoch": 2.3401677539608574, "grad_norm": 0.1437764153110562, "learning_rate": 4.8878149810148435e-06, "loss": 0.4471, "step": 2511 }, { "epoch": 2.341099720410065, "grad_norm": 0.14443389559279157, "learning_rate": 4.880911287538833e-06, "loss": 0.4378, "step": 2512 }, { "epoch": 2.342031686859273, "grad_norm": 0.13743982121188406, "learning_rate": 4.874007594062824e-06, "loss": 0.4491, "step": 2513 }, { "epoch": 2.3429636533084808, "grad_norm": 0.13963954883572852, "learning_rate": 4.867103900586814e-06, "loss": 0.4435, "step": 2514 }, { "epoch": 2.3438956197576886, "grad_norm": 0.14796738090169592, "learning_rate": 4.860200207110805e-06, "loss": 0.4758, "step": 2515 }, { "epoch": 2.344827586206897, "grad_norm": 0.14286240762564845, "learning_rate": 4.853296513634795e-06, "loss": 0.4392, "step": 2516 }, { "epoch": 2.345759552656104, "grad_norm": 0.14485745156616747, "learning_rate": 4.846392820158785e-06, "loss": 0.4481, "step": 2517 }, { "epoch": 2.3466915191053124, "grad_norm": 0.14122670888076677, "learning_rate": 4.839489126682776e-06, "loss": 0.4406, "step": 2518 }, { "epoch": 2.34762348555452, "grad_norm": 0.13869407495272376, "learning_rate": 4.832585433206766e-06, "loss": 0.4585, "step": 2519 }, { "epoch": 2.348555452003728, "grad_norm": 0.13983252544858904, "learning_rate": 4.825681739730757e-06, "loss": 0.4483, "step": 2520 }, { "epoch": 2.3494874184529357, "grad_norm": 0.1349453459332325, "learning_rate": 4.8187780462547465e-06, "loss": 0.4344, "step": 2521 }, { "epoch": 2.3504193849021435, "grad_norm": 0.13528941501160405, "learning_rate": 4.811874352778737e-06, "loss": 0.44, "step": 2522 }, { "epoch": 2.3513513513513513, "grad_norm": 0.14651926214838581, "learning_rate": 4.804970659302727e-06, "loss": 0.4342, "step": 2523 }, { "epoch": 2.352283317800559, "grad_norm": 0.13813811561235, "learning_rate": 4.798066965826718e-06, "loss": 0.4514, "step": 2524 }, { "epoch": 2.353215284249767, "grad_norm": 0.14108345597920094, "learning_rate": 4.791163272350708e-06, "loss": 0.4546, "step": 2525 }, { "epoch": 2.3541472506989747, "grad_norm": 0.14929552355398273, "learning_rate": 4.7842595788746985e-06, "loss": 0.464, "step": 2526 }, { "epoch": 2.355079217148183, "grad_norm": 0.14350738294521936, "learning_rate": 4.777355885398688e-06, "loss": 0.4567, "step": 2527 }, { "epoch": 2.3560111835973903, "grad_norm": 0.13869603968823485, "learning_rate": 4.770452191922679e-06, "loss": 0.4324, "step": 2528 }, { "epoch": 2.3569431500465985, "grad_norm": 0.1318204465917944, "learning_rate": 4.76354849844667e-06, "loss": 0.4455, "step": 2529 }, { "epoch": 2.3578751164958063, "grad_norm": 0.1446804052822308, "learning_rate": 4.75664480497066e-06, "loss": 0.4523, "step": 2530 }, { "epoch": 2.358807082945014, "grad_norm": 0.14407499286631367, "learning_rate": 4.7497411114946505e-06, "loss": 0.4611, "step": 2531 }, { "epoch": 2.359739049394222, "grad_norm": 0.14647239613544985, "learning_rate": 4.74283741801864e-06, "loss": 0.4449, "step": 2532 }, { "epoch": 2.3606710158434296, "grad_norm": 0.1373510835467715, "learning_rate": 4.73593372454263e-06, "loss": 0.4274, "step": 2533 }, { "epoch": 2.3616029822926374, "grad_norm": 0.13776116534379013, "learning_rate": 4.729030031066621e-06, "loss": 0.4493, "step": 2534 }, { "epoch": 2.362534948741845, "grad_norm": 0.14530381169736004, "learning_rate": 4.722126337590611e-06, "loss": 0.4396, "step": 2535 }, { "epoch": 2.363466915191053, "grad_norm": 0.13654803189308087, "learning_rate": 4.715222644114602e-06, "loss": 0.4251, "step": 2536 }, { "epoch": 2.364398881640261, "grad_norm": 0.1403222734736576, "learning_rate": 4.7083189506385915e-06, "loss": 0.4527, "step": 2537 }, { "epoch": 2.3653308480894686, "grad_norm": 0.14036878256147975, "learning_rate": 4.701415257162582e-06, "loss": 0.4529, "step": 2538 }, { "epoch": 2.3662628145386764, "grad_norm": 0.13344703717607087, "learning_rate": 4.694511563686573e-06, "loss": 0.4109, "step": 2539 }, { "epoch": 2.3671947809878846, "grad_norm": 0.13883781259348218, "learning_rate": 4.687607870210563e-06, "loss": 0.4237, "step": 2540 }, { "epoch": 2.3681267474370924, "grad_norm": 0.1435882302696084, "learning_rate": 4.6807041767345536e-06, "loss": 0.4269, "step": 2541 }, { "epoch": 2.3690587138863, "grad_norm": 0.1400238761755143, "learning_rate": 4.6738004832585435e-06, "loss": 0.4498, "step": 2542 }, { "epoch": 2.369990680335508, "grad_norm": 0.1659956042387449, "learning_rate": 4.666896789782534e-06, "loss": 0.4941, "step": 2543 }, { "epoch": 2.3709226467847158, "grad_norm": 0.15033664083190185, "learning_rate": 4.659993096306524e-06, "loss": 0.4585, "step": 2544 }, { "epoch": 2.3718546132339235, "grad_norm": 0.1366284169303317, "learning_rate": 4.653089402830515e-06, "loss": 0.4363, "step": 2545 }, { "epoch": 2.3727865796831313, "grad_norm": 0.14155984455684384, "learning_rate": 4.646185709354505e-06, "loss": 0.4558, "step": 2546 }, { "epoch": 2.373718546132339, "grad_norm": 0.15569616836461928, "learning_rate": 4.6392820158784954e-06, "loss": 0.4368, "step": 2547 }, { "epoch": 2.374650512581547, "grad_norm": 0.13241782597058288, "learning_rate": 4.632378322402486e-06, "loss": 0.4161, "step": 2548 }, { "epoch": 2.3755824790307547, "grad_norm": 0.1463554984423796, "learning_rate": 4.625474628926476e-06, "loss": 0.4277, "step": 2549 }, { "epoch": 2.3765144454799625, "grad_norm": 0.13429097975554866, "learning_rate": 4.618570935450467e-06, "loss": 0.4302, "step": 2550 }, { "epoch": 2.3774464119291707, "grad_norm": 0.1343408954543962, "learning_rate": 4.611667241974457e-06, "loss": 0.4453, "step": 2551 }, { "epoch": 2.3783783783783785, "grad_norm": 0.13462627478867814, "learning_rate": 4.604763548498447e-06, "loss": 0.4188, "step": 2552 }, { "epoch": 2.3793103448275863, "grad_norm": 0.1364237106863653, "learning_rate": 4.597859855022437e-06, "loss": 0.443, "step": 2553 }, { "epoch": 2.380242311276794, "grad_norm": 0.14314107175928523, "learning_rate": 4.590956161546428e-06, "loss": 0.4328, "step": 2554 }, { "epoch": 2.381174277726002, "grad_norm": 0.1379690141994017, "learning_rate": 4.584052468070418e-06, "loss": 0.4284, "step": 2555 }, { "epoch": 2.3821062441752097, "grad_norm": 0.13476913917825167, "learning_rate": 4.577148774594408e-06, "loss": 0.4415, "step": 2556 }, { "epoch": 2.3830382106244175, "grad_norm": 0.14196606654987468, "learning_rate": 4.5702450811183985e-06, "loss": 0.4373, "step": 2557 }, { "epoch": 2.3839701770736252, "grad_norm": 0.13742270169993337, "learning_rate": 4.563341387642389e-06, "loss": 0.437, "step": 2558 }, { "epoch": 2.384902143522833, "grad_norm": 0.14413103142601527, "learning_rate": 4.55643769416638e-06, "loss": 0.4629, "step": 2559 }, { "epoch": 2.385834109972041, "grad_norm": 0.14402588730441293, "learning_rate": 4.54953400069037e-06, "loss": 0.4247, "step": 2560 }, { "epoch": 2.3867660764212486, "grad_norm": 0.14744672125164146, "learning_rate": 4.54263030721436e-06, "loss": 0.4655, "step": 2561 }, { "epoch": 2.387698042870457, "grad_norm": 0.14850810783727464, "learning_rate": 4.5357266137383505e-06, "loss": 0.4473, "step": 2562 }, { "epoch": 2.3886300093196646, "grad_norm": 0.1514078089266513, "learning_rate": 4.52882292026234e-06, "loss": 0.462, "step": 2563 }, { "epoch": 2.3895619757688724, "grad_norm": 0.14090962087373368, "learning_rate": 4.521919226786331e-06, "loss": 0.4601, "step": 2564 }, { "epoch": 2.39049394221808, "grad_norm": 0.14778493931217224, "learning_rate": 4.515015533310321e-06, "loss": 0.4564, "step": 2565 }, { "epoch": 2.391425908667288, "grad_norm": 0.13440173781331843, "learning_rate": 4.508111839834312e-06, "loss": 0.4301, "step": 2566 }, { "epoch": 2.392357875116496, "grad_norm": 0.13640317944988714, "learning_rate": 4.501208146358302e-06, "loss": 0.4528, "step": 2567 }, { "epoch": 2.3932898415657036, "grad_norm": 0.14412340175547855, "learning_rate": 4.494304452882292e-06, "loss": 0.4348, "step": 2568 }, { "epoch": 2.3942218080149114, "grad_norm": 0.1556851691384956, "learning_rate": 4.487400759406283e-06, "loss": 0.4694, "step": 2569 }, { "epoch": 2.395153774464119, "grad_norm": 0.14960781194159317, "learning_rate": 4.480497065930273e-06, "loss": 0.4558, "step": 2570 }, { "epoch": 2.396085740913327, "grad_norm": 0.1412422988204557, "learning_rate": 4.473593372454264e-06, "loss": 0.4275, "step": 2571 }, { "epoch": 2.3970177073625347, "grad_norm": 0.1476266843201053, "learning_rate": 4.466689678978254e-06, "loss": 0.4625, "step": 2572 }, { "epoch": 2.397949673811743, "grad_norm": 0.143489008093868, "learning_rate": 4.459785985502244e-06, "loss": 0.4463, "step": 2573 }, { "epoch": 2.3988816402609507, "grad_norm": 0.14616396806521825, "learning_rate": 4.452882292026234e-06, "loss": 0.4443, "step": 2574 }, { "epoch": 2.3998136067101585, "grad_norm": 0.13532403753797106, "learning_rate": 4.445978598550225e-06, "loss": 0.4234, "step": 2575 }, { "epoch": 2.4007455731593663, "grad_norm": 0.14039591803510362, "learning_rate": 4.439074905074215e-06, "loss": 0.4513, "step": 2576 }, { "epoch": 2.401677539608574, "grad_norm": 0.13469021195641054, "learning_rate": 4.4321712115982055e-06, "loss": 0.4454, "step": 2577 }, { "epoch": 2.402609506057782, "grad_norm": 0.1434119003179074, "learning_rate": 4.4252675181221954e-06, "loss": 0.446, "step": 2578 }, { "epoch": 2.4035414725069897, "grad_norm": 0.14745339604612687, "learning_rate": 4.418363824646186e-06, "loss": 0.4433, "step": 2579 }, { "epoch": 2.4044734389561975, "grad_norm": 0.13720575392112916, "learning_rate": 4.411460131170177e-06, "loss": 0.4133, "step": 2580 }, { "epoch": 2.4054054054054053, "grad_norm": 0.1476687811329784, "learning_rate": 4.404556437694167e-06, "loss": 0.4624, "step": 2581 }, { "epoch": 2.406337371854613, "grad_norm": 0.14276050139804097, "learning_rate": 4.3976527442181575e-06, "loss": 0.4669, "step": 2582 }, { "epoch": 2.407269338303821, "grad_norm": 0.1504853233203735, "learning_rate": 4.390749050742147e-06, "loss": 0.4657, "step": 2583 }, { "epoch": 2.408201304753029, "grad_norm": 0.149942004806494, "learning_rate": 4.383845357266137e-06, "loss": 0.4838, "step": 2584 }, { "epoch": 2.409133271202237, "grad_norm": 0.14078741044700727, "learning_rate": 4.376941663790128e-06, "loss": 0.4452, "step": 2585 }, { "epoch": 2.4100652376514446, "grad_norm": 0.13486396211830926, "learning_rate": 4.370037970314118e-06, "loss": 0.4277, "step": 2586 }, { "epoch": 2.4109972041006524, "grad_norm": 0.14706826980031334, "learning_rate": 4.363134276838109e-06, "loss": 0.4539, "step": 2587 }, { "epoch": 2.4119291705498602, "grad_norm": 0.13494795769581264, "learning_rate": 4.3562305833620985e-06, "loss": 0.4237, "step": 2588 }, { "epoch": 2.412861136999068, "grad_norm": 0.14838659866048146, "learning_rate": 4.349326889886089e-06, "loss": 0.4447, "step": 2589 }, { "epoch": 2.413793103448276, "grad_norm": 0.13419557918381458, "learning_rate": 4.34242319641008e-06, "loss": 0.4325, "step": 2590 }, { "epoch": 2.4147250698974836, "grad_norm": 0.1315256280118296, "learning_rate": 4.33551950293407e-06, "loss": 0.4258, "step": 2591 }, { "epoch": 2.4156570363466914, "grad_norm": 0.1404886005329526, "learning_rate": 4.328615809458061e-06, "loss": 0.4389, "step": 2592 }, { "epoch": 2.416589002795899, "grad_norm": 0.16671019312126836, "learning_rate": 4.3217121159820505e-06, "loss": 0.4637, "step": 2593 }, { "epoch": 2.417520969245107, "grad_norm": 0.14039892534387563, "learning_rate": 4.314808422506041e-06, "loss": 0.4509, "step": 2594 }, { "epoch": 2.418452935694315, "grad_norm": 0.1409151433228989, "learning_rate": 4.307904729030031e-06, "loss": 0.4521, "step": 2595 }, { "epoch": 2.419384902143523, "grad_norm": 0.14203036489817392, "learning_rate": 4.301001035554022e-06, "loss": 0.4709, "step": 2596 }, { "epoch": 2.4203168685927308, "grad_norm": 0.1452897864546534, "learning_rate": 4.294097342078012e-06, "loss": 0.4658, "step": 2597 }, { "epoch": 2.4212488350419386, "grad_norm": 0.14485943938687806, "learning_rate": 4.2871936486020025e-06, "loss": 0.4514, "step": 2598 }, { "epoch": 2.4221808014911463, "grad_norm": 0.13375279352619188, "learning_rate": 4.280289955125993e-06, "loss": 0.4188, "step": 2599 }, { "epoch": 2.423112767940354, "grad_norm": 0.13598754344454694, "learning_rate": 4.273386261649983e-06, "loss": 0.431, "step": 2600 }, { "epoch": 2.424044734389562, "grad_norm": 0.13968062369464648, "learning_rate": 4.266482568173974e-06, "loss": 0.4467, "step": 2601 }, { "epoch": 2.4249767008387697, "grad_norm": 0.13645464513585856, "learning_rate": 4.259578874697964e-06, "loss": 0.4503, "step": 2602 }, { "epoch": 2.4259086672879775, "grad_norm": 0.14428458843165035, "learning_rate": 4.2526751812219544e-06, "loss": 0.4655, "step": 2603 }, { "epoch": 2.4268406337371853, "grad_norm": 0.1551617541606132, "learning_rate": 4.245771487745944e-06, "loss": 0.4354, "step": 2604 }, { "epoch": 2.427772600186393, "grad_norm": 0.13895956851683222, "learning_rate": 4.238867794269935e-06, "loss": 0.4638, "step": 2605 }, { "epoch": 2.4287045666356013, "grad_norm": 0.1402213636540423, "learning_rate": 4.231964100793925e-06, "loss": 0.4526, "step": 2606 }, { "epoch": 2.429636533084809, "grad_norm": 0.1372758256454388, "learning_rate": 4.225060407317915e-06, "loss": 0.4354, "step": 2607 }, { "epoch": 2.430568499534017, "grad_norm": 0.14625708529963027, "learning_rate": 4.2181567138419055e-06, "loss": 0.4337, "step": 2608 }, { "epoch": 2.4315004659832247, "grad_norm": 0.14121921188783507, "learning_rate": 4.211253020365896e-06, "loss": 0.4396, "step": 2609 }, { "epoch": 2.4324324324324325, "grad_norm": 0.13382918700545263, "learning_rate": 4.204349326889887e-06, "loss": 0.4361, "step": 2610 }, { "epoch": 2.4333643988816402, "grad_norm": 0.14106153104140357, "learning_rate": 4.197445633413877e-06, "loss": 0.4514, "step": 2611 }, { "epoch": 2.434296365330848, "grad_norm": 0.14385228614326348, "learning_rate": 4.190541939937867e-06, "loss": 0.4574, "step": 2612 }, { "epoch": 2.435228331780056, "grad_norm": 0.13320173451667175, "learning_rate": 4.1836382464618575e-06, "loss": 0.412, "step": 2613 }, { "epoch": 2.4361602982292636, "grad_norm": 0.13472660173787254, "learning_rate": 4.176734552985847e-06, "loss": 0.4307, "step": 2614 }, { "epoch": 2.4370922646784714, "grad_norm": 0.13841450606840003, "learning_rate": 4.169830859509838e-06, "loss": 0.4446, "step": 2615 }, { "epoch": 2.438024231127679, "grad_norm": 0.13277651912344027, "learning_rate": 4.162927166033828e-06, "loss": 0.4298, "step": 2616 }, { "epoch": 2.4389561975768874, "grad_norm": 0.13532114062737907, "learning_rate": 4.156023472557819e-06, "loss": 0.4438, "step": 2617 }, { "epoch": 2.439888164026095, "grad_norm": 0.13035835671361032, "learning_rate": 4.149119779081809e-06, "loss": 0.4246, "step": 2618 }, { "epoch": 2.440820130475303, "grad_norm": 0.15246382277596002, "learning_rate": 4.142216085605799e-06, "loss": 0.4518, "step": 2619 }, { "epoch": 2.441752096924511, "grad_norm": 0.14315875186621688, "learning_rate": 4.13531239212979e-06, "loss": 0.4495, "step": 2620 }, { "epoch": 2.4426840633737186, "grad_norm": 0.1367804417175292, "learning_rate": 4.12840869865378e-06, "loss": 0.4619, "step": 2621 }, { "epoch": 2.4436160298229264, "grad_norm": 0.1352176253401343, "learning_rate": 4.121505005177771e-06, "loss": 0.4324, "step": 2622 }, { "epoch": 2.444547996272134, "grad_norm": 0.13210635466059722, "learning_rate": 4.114601311701761e-06, "loss": 0.4277, "step": 2623 }, { "epoch": 2.445479962721342, "grad_norm": 0.13353514991640758, "learning_rate": 4.107697618225751e-06, "loss": 0.4379, "step": 2624 }, { "epoch": 2.4464119291705497, "grad_norm": 0.14228441801034467, "learning_rate": 4.100793924749741e-06, "loss": 0.4412, "step": 2625 }, { "epoch": 2.4473438956197575, "grad_norm": 0.14073897185166925, "learning_rate": 4.093890231273732e-06, "loss": 0.4361, "step": 2626 }, { "epoch": 2.4482758620689653, "grad_norm": 0.14054588106244334, "learning_rate": 4.086986537797722e-06, "loss": 0.4338, "step": 2627 }, { "epoch": 2.4492078285181735, "grad_norm": 0.1451603696703449, "learning_rate": 4.0800828443217126e-06, "loss": 0.4395, "step": 2628 }, { "epoch": 2.4501397949673813, "grad_norm": 0.13730919865224647, "learning_rate": 4.073179150845703e-06, "loss": 0.4552, "step": 2629 }, { "epoch": 2.451071761416589, "grad_norm": 0.13731753935365254, "learning_rate": 4.066275457369693e-06, "loss": 0.4313, "step": 2630 }, { "epoch": 2.452003727865797, "grad_norm": 0.14364896773265975, "learning_rate": 4.059371763893684e-06, "loss": 0.4424, "step": 2631 }, { "epoch": 2.4529356943150047, "grad_norm": 0.1389857703736214, "learning_rate": 4.052468070417674e-06, "loss": 0.4367, "step": 2632 }, { "epoch": 2.4538676607642125, "grad_norm": 0.13951346999274733, "learning_rate": 4.0455643769416645e-06, "loss": 0.4364, "step": 2633 }, { "epoch": 2.4547996272134203, "grad_norm": 0.12998907460169873, "learning_rate": 4.0386606834656544e-06, "loss": 0.4164, "step": 2634 }, { "epoch": 2.455731593662628, "grad_norm": 0.13841656998466897, "learning_rate": 4.031756989989644e-06, "loss": 0.4327, "step": 2635 }, { "epoch": 2.456663560111836, "grad_norm": 0.13585992076595046, "learning_rate": 4.024853296513635e-06, "loss": 0.4158, "step": 2636 }, { "epoch": 2.4575955265610436, "grad_norm": 0.1343881577542756, "learning_rate": 4.017949603037625e-06, "loss": 0.455, "step": 2637 }, { "epoch": 2.4585274930102514, "grad_norm": 0.13572627321253664, "learning_rate": 4.011045909561616e-06, "loss": 0.434, "step": 2638 }, { "epoch": 2.4594594594594597, "grad_norm": 0.13683612550086546, "learning_rate": 4.004142216085606e-06, "loss": 0.4498, "step": 2639 }, { "epoch": 2.4603914259086674, "grad_norm": 0.1368748911375213, "learning_rate": 3.997238522609596e-06, "loss": 0.4282, "step": 2640 }, { "epoch": 2.4613233923578752, "grad_norm": 0.13886315189397333, "learning_rate": 3.990334829133587e-06, "loss": 0.4497, "step": 2641 }, { "epoch": 2.462255358807083, "grad_norm": 0.1451774165379331, "learning_rate": 3.983431135657577e-06, "loss": 0.4344, "step": 2642 }, { "epoch": 2.463187325256291, "grad_norm": 0.13552672543066482, "learning_rate": 3.976527442181568e-06, "loss": 0.4366, "step": 2643 }, { "epoch": 2.4641192917054986, "grad_norm": 0.14033628323198002, "learning_rate": 3.9696237487055575e-06, "loss": 0.4398, "step": 2644 }, { "epoch": 2.4650512581547064, "grad_norm": 0.13381069161013628, "learning_rate": 3.962720055229548e-06, "loss": 0.4016, "step": 2645 }, { "epoch": 2.465983224603914, "grad_norm": 0.13957205176117127, "learning_rate": 3.955816361753538e-06, "loss": 0.4603, "step": 2646 }, { "epoch": 2.466915191053122, "grad_norm": 0.15575716577734527, "learning_rate": 3.948912668277529e-06, "loss": 0.4534, "step": 2647 }, { "epoch": 2.4678471575023297, "grad_norm": 0.13530310081810643, "learning_rate": 3.942008974801519e-06, "loss": 0.43, "step": 2648 }, { "epoch": 2.4687791239515375, "grad_norm": 0.1460792791410086, "learning_rate": 3.9351052813255095e-06, "loss": 0.4324, "step": 2649 }, { "epoch": 2.4697110904007458, "grad_norm": 0.14871170832488337, "learning_rate": 3.9282015878495e-06, "loss": 0.466, "step": 2650 }, { "epoch": 2.4706430568499536, "grad_norm": 0.13599466482153377, "learning_rate": 3.92129789437349e-06, "loss": 0.4174, "step": 2651 }, { "epoch": 2.4715750232991613, "grad_norm": 0.14561860352449868, "learning_rate": 3.914394200897481e-06, "loss": 0.456, "step": 2652 }, { "epoch": 2.472506989748369, "grad_norm": 0.14110729277001138, "learning_rate": 3.907490507421471e-06, "loss": 0.4425, "step": 2653 }, { "epoch": 2.473438956197577, "grad_norm": 0.14623819748423394, "learning_rate": 3.9005868139454615e-06, "loss": 0.4448, "step": 2654 }, { "epoch": 2.4743709226467847, "grad_norm": 0.16798187450006874, "learning_rate": 3.893683120469451e-06, "loss": 0.4597, "step": 2655 }, { "epoch": 2.4753028890959925, "grad_norm": 0.14307786993418803, "learning_rate": 3.886779426993442e-06, "loss": 0.4524, "step": 2656 }, { "epoch": 2.4762348555452003, "grad_norm": 0.12714549495687583, "learning_rate": 3.879875733517432e-06, "loss": 0.4094, "step": 2657 }, { "epoch": 2.477166821994408, "grad_norm": 0.13360806595135422, "learning_rate": 3.872972040041422e-06, "loss": 0.4246, "step": 2658 }, { "epoch": 2.478098788443616, "grad_norm": 0.14122491849794208, "learning_rate": 3.866068346565413e-06, "loss": 0.45, "step": 2659 }, { "epoch": 2.4790307548928237, "grad_norm": 0.136556089137295, "learning_rate": 3.859164653089403e-06, "loss": 0.4456, "step": 2660 }, { "epoch": 2.479962721342032, "grad_norm": 0.13123481604632123, "learning_rate": 3.852260959613394e-06, "loss": 0.4357, "step": 2661 }, { "epoch": 2.4808946877912397, "grad_norm": 0.1450760502125706, "learning_rate": 3.845357266137384e-06, "loss": 0.4674, "step": 2662 }, { "epoch": 2.4818266542404475, "grad_norm": 0.14211550447828852, "learning_rate": 3.838453572661374e-06, "loss": 0.4625, "step": 2663 }, { "epoch": 2.4827586206896552, "grad_norm": 0.13539820895847082, "learning_rate": 3.8315498791853645e-06, "loss": 0.4213, "step": 2664 }, { "epoch": 2.483690587138863, "grad_norm": 0.1322776067922632, "learning_rate": 3.8246461857093544e-06, "loss": 0.4354, "step": 2665 }, { "epoch": 2.484622553588071, "grad_norm": 0.1389981834399688, "learning_rate": 3.817742492233345e-06, "loss": 0.4468, "step": 2666 }, { "epoch": 2.4855545200372786, "grad_norm": 0.14542046826767308, "learning_rate": 3.8108387987573355e-06, "loss": 0.4661, "step": 2667 }, { "epoch": 2.4864864864864864, "grad_norm": 0.12998702607263118, "learning_rate": 3.8039351052813258e-06, "loss": 0.4214, "step": 2668 }, { "epoch": 2.487418452935694, "grad_norm": 0.14681064831819743, "learning_rate": 3.7970314118053165e-06, "loss": 0.414, "step": 2669 }, { "epoch": 2.488350419384902, "grad_norm": 0.13925194748141365, "learning_rate": 3.790127718329307e-06, "loss": 0.4541, "step": 2670 }, { "epoch": 2.4892823858341098, "grad_norm": 0.1408804920119628, "learning_rate": 3.783224024853297e-06, "loss": 0.4559, "step": 2671 }, { "epoch": 2.490214352283318, "grad_norm": 0.1327269113804498, "learning_rate": 3.7763203313772874e-06, "loss": 0.4348, "step": 2672 }, { "epoch": 2.491146318732526, "grad_norm": 0.1479080340689389, "learning_rate": 3.7694166379012777e-06, "loss": 0.488, "step": 2673 }, { "epoch": 2.4920782851817336, "grad_norm": 0.13649837601276302, "learning_rate": 3.7625129444252676e-06, "loss": 0.433, "step": 2674 }, { "epoch": 2.4930102516309414, "grad_norm": 0.1376733740668745, "learning_rate": 3.755609250949258e-06, "loss": 0.4566, "step": 2675 }, { "epoch": 2.493942218080149, "grad_norm": 0.13731385745985322, "learning_rate": 3.7487055574732483e-06, "loss": 0.4393, "step": 2676 }, { "epoch": 2.494874184529357, "grad_norm": 0.13910657856090694, "learning_rate": 3.7418018639972386e-06, "loss": 0.4437, "step": 2677 }, { "epoch": 2.4958061509785647, "grad_norm": 0.13829827376583068, "learning_rate": 3.734898170521229e-06, "loss": 0.4416, "step": 2678 }, { "epoch": 2.4967381174277725, "grad_norm": 0.14411187286980176, "learning_rate": 3.7279944770452196e-06, "loss": 0.4601, "step": 2679 }, { "epoch": 2.4976700838769803, "grad_norm": 0.15365632685588265, "learning_rate": 3.72109078356921e-06, "loss": 0.4663, "step": 2680 }, { "epoch": 2.498602050326188, "grad_norm": 0.13950497115462754, "learning_rate": 3.7141870900932002e-06, "loss": 0.4548, "step": 2681 }, { "epoch": 2.499534016775396, "grad_norm": 0.13976772852250707, "learning_rate": 3.7072833966171905e-06, "loss": 0.4336, "step": 2682 }, { "epoch": 2.500465983224604, "grad_norm": 0.14811414037257425, "learning_rate": 3.700379703141181e-06, "loss": 0.4279, "step": 2683 }, { "epoch": 2.501397949673812, "grad_norm": 0.13760312141545117, "learning_rate": 3.693476009665171e-06, "loss": 0.4317, "step": 2684 }, { "epoch": 2.5023299161230197, "grad_norm": 0.14518328426962834, "learning_rate": 3.6865723161891615e-06, "loss": 0.4613, "step": 2685 }, { "epoch": 2.5032618825722275, "grad_norm": 0.13890688910681995, "learning_rate": 3.6796686227131518e-06, "loss": 0.4632, "step": 2686 }, { "epoch": 2.5041938490214353, "grad_norm": 0.14282655629369312, "learning_rate": 3.672764929237142e-06, "loss": 0.4426, "step": 2687 }, { "epoch": 2.505125815470643, "grad_norm": 0.13209811627663434, "learning_rate": 3.6658612357611324e-06, "loss": 0.4404, "step": 2688 }, { "epoch": 2.506057781919851, "grad_norm": 0.14526631732744588, "learning_rate": 3.6589575422851227e-06, "loss": 0.4609, "step": 2689 }, { "epoch": 2.5069897483690586, "grad_norm": 0.14181285723792103, "learning_rate": 3.6520538488091134e-06, "loss": 0.4571, "step": 2690 }, { "epoch": 2.5079217148182664, "grad_norm": 0.13533230445279626, "learning_rate": 3.6451501553331037e-06, "loss": 0.4512, "step": 2691 }, { "epoch": 2.508853681267474, "grad_norm": 0.13393631607439568, "learning_rate": 3.638246461857094e-06, "loss": 0.4307, "step": 2692 }, { "epoch": 2.509785647716682, "grad_norm": 0.13871752405898863, "learning_rate": 3.6313427683810844e-06, "loss": 0.4317, "step": 2693 }, { "epoch": 2.5107176141658902, "grad_norm": 0.15438976629427967, "learning_rate": 3.6244390749050747e-06, "loss": 0.453, "step": 2694 }, { "epoch": 2.511649580615098, "grad_norm": 0.13536813890354737, "learning_rate": 3.617535381429065e-06, "loss": 0.4385, "step": 2695 }, { "epoch": 2.512581547064306, "grad_norm": 0.1350181332041547, "learning_rate": 3.6106316879530553e-06, "loss": 0.449, "step": 2696 }, { "epoch": 2.5135135135135136, "grad_norm": 0.13694088474904226, "learning_rate": 3.603727994477045e-06, "loss": 0.4379, "step": 2697 }, { "epoch": 2.5144454799627214, "grad_norm": 0.14467527538041813, "learning_rate": 3.5968243010010355e-06, "loss": 0.4535, "step": 2698 }, { "epoch": 2.515377446411929, "grad_norm": 0.1352848516869642, "learning_rate": 3.5899206075250258e-06, "loss": 0.4276, "step": 2699 }, { "epoch": 2.516309412861137, "grad_norm": 0.14028999186035426, "learning_rate": 3.583016914049017e-06, "loss": 0.4317, "step": 2700 }, { "epoch": 2.5172413793103448, "grad_norm": 0.13283555455171486, "learning_rate": 3.576113220573007e-06, "loss": 0.417, "step": 2701 }, { "epoch": 2.5181733457595525, "grad_norm": 0.1410912786034597, "learning_rate": 3.569209527096997e-06, "loss": 0.4713, "step": 2702 }, { "epoch": 2.5191053122087603, "grad_norm": 0.12940251798258065, "learning_rate": 3.5623058336209874e-06, "loss": 0.4144, "step": 2703 }, { "epoch": 2.520037278657968, "grad_norm": 0.1352252888415344, "learning_rate": 3.5554021401449777e-06, "loss": 0.4257, "step": 2704 }, { "epoch": 2.5209692451071763, "grad_norm": 0.1468214652002071, "learning_rate": 3.548498446668968e-06, "loss": 0.4902, "step": 2705 }, { "epoch": 2.5219012115563837, "grad_norm": 0.13804436201363646, "learning_rate": 3.5415947531929584e-06, "loss": 0.4361, "step": 2706 }, { "epoch": 2.522833178005592, "grad_norm": 0.13212833484835007, "learning_rate": 3.5346910597169487e-06, "loss": 0.4492, "step": 2707 }, { "epoch": 2.5237651444547997, "grad_norm": 0.13728698657293661, "learning_rate": 3.527787366240939e-06, "loss": 0.4492, "step": 2708 }, { "epoch": 2.5246971109040075, "grad_norm": 0.13524400610748014, "learning_rate": 3.5208836727649293e-06, "loss": 0.4358, "step": 2709 }, { "epoch": 2.5256290773532153, "grad_norm": 0.1325480109730005, "learning_rate": 3.51397997928892e-06, "loss": 0.4392, "step": 2710 }, { "epoch": 2.526561043802423, "grad_norm": 0.13425329277427245, "learning_rate": 3.5070762858129103e-06, "loss": 0.4506, "step": 2711 }, { "epoch": 2.527493010251631, "grad_norm": 0.13621716188441552, "learning_rate": 3.5001725923369006e-06, "loss": 0.4318, "step": 2712 }, { "epoch": 2.5284249767008387, "grad_norm": 0.1348778193681788, "learning_rate": 3.493268898860891e-06, "loss": 0.4402, "step": 2713 }, { "epoch": 2.5293569431500464, "grad_norm": 0.13852136078689242, "learning_rate": 3.4863652053848813e-06, "loss": 0.4319, "step": 2714 }, { "epoch": 2.5302889095992542, "grad_norm": 0.14203146540075956, "learning_rate": 3.4794615119088716e-06, "loss": 0.4668, "step": 2715 }, { "epoch": 2.5312208760484625, "grad_norm": 0.1338671005493489, "learning_rate": 3.472557818432862e-06, "loss": 0.4198, "step": 2716 }, { "epoch": 2.53215284249767, "grad_norm": 0.14217893128086173, "learning_rate": 3.465654124956852e-06, "loss": 0.4723, "step": 2717 }, { "epoch": 2.533084808946878, "grad_norm": 0.14059692386602904, "learning_rate": 3.4587504314808425e-06, "loss": 0.4678, "step": 2718 }, { "epoch": 2.534016775396086, "grad_norm": 0.13744154654534876, "learning_rate": 3.451846738004833e-06, "loss": 0.4491, "step": 2719 }, { "epoch": 2.5349487418452936, "grad_norm": 0.1309171656445311, "learning_rate": 3.4449430445288235e-06, "loss": 0.436, "step": 2720 }, { "epoch": 2.5358807082945014, "grad_norm": 0.13577895446115928, "learning_rate": 3.438039351052814e-06, "loss": 0.4203, "step": 2721 }, { "epoch": 2.536812674743709, "grad_norm": 0.14559834394403262, "learning_rate": 3.431135657576804e-06, "loss": 0.4532, "step": 2722 }, { "epoch": 2.537744641192917, "grad_norm": 0.13740348715921402, "learning_rate": 3.4242319641007945e-06, "loss": 0.4207, "step": 2723 }, { "epoch": 2.5386766076421248, "grad_norm": 0.12764228847622158, "learning_rate": 3.4173282706247844e-06, "loss": 0.4197, "step": 2724 }, { "epoch": 2.5396085740913326, "grad_norm": 0.1486911886037966, "learning_rate": 3.4104245771487747e-06, "loss": 0.4345, "step": 2725 }, { "epoch": 2.5405405405405403, "grad_norm": 0.13666600989356614, "learning_rate": 3.403520883672765e-06, "loss": 0.436, "step": 2726 }, { "epoch": 2.5414725069897486, "grad_norm": 0.1380600798087464, "learning_rate": 3.3966171901967553e-06, "loss": 0.4376, "step": 2727 }, { "epoch": 2.542404473438956, "grad_norm": 0.132610747131142, "learning_rate": 3.3897134967207456e-06, "loss": 0.4258, "step": 2728 }, { "epoch": 2.543336439888164, "grad_norm": 0.1450154701014088, "learning_rate": 3.382809803244736e-06, "loss": 0.4404, "step": 2729 }, { "epoch": 2.544268406337372, "grad_norm": 0.13787290584498213, "learning_rate": 3.3759061097687266e-06, "loss": 0.4202, "step": 2730 }, { "epoch": 2.5452003727865797, "grad_norm": 0.14308388452863574, "learning_rate": 3.369002416292717e-06, "loss": 0.4557, "step": 2731 }, { "epoch": 2.5461323392357875, "grad_norm": 0.14081637958316565, "learning_rate": 3.3620987228167072e-06, "loss": 0.4312, "step": 2732 }, { "epoch": 2.5470643056849953, "grad_norm": 0.1502774923324391, "learning_rate": 3.3551950293406976e-06, "loss": 0.4547, "step": 2733 }, { "epoch": 2.547996272134203, "grad_norm": 0.14218899498569207, "learning_rate": 3.348291335864688e-06, "loss": 0.4494, "step": 2734 }, { "epoch": 2.548928238583411, "grad_norm": 0.1376188404823951, "learning_rate": 3.341387642388678e-06, "loss": 0.4335, "step": 2735 }, { "epoch": 2.5498602050326187, "grad_norm": 0.14734039337791052, "learning_rate": 3.3344839489126685e-06, "loss": 0.4179, "step": 2736 }, { "epoch": 2.5507921714818265, "grad_norm": 0.13724395878061021, "learning_rate": 3.327580255436659e-06, "loss": 0.4419, "step": 2737 }, { "epoch": 2.5517241379310347, "grad_norm": 0.1379909913214508, "learning_rate": 3.320676561960649e-06, "loss": 0.4258, "step": 2738 }, { "epoch": 2.552656104380242, "grad_norm": 0.13550969028996523, "learning_rate": 3.3137728684846394e-06, "loss": 0.4398, "step": 2739 }, { "epoch": 2.5535880708294503, "grad_norm": 0.13636285495049502, "learning_rate": 3.30686917500863e-06, "loss": 0.4448, "step": 2740 }, { "epoch": 2.554520037278658, "grad_norm": 0.13730878065128524, "learning_rate": 3.2999654815326205e-06, "loss": 0.4329, "step": 2741 }, { "epoch": 2.555452003727866, "grad_norm": 0.13147623381641155, "learning_rate": 3.2930617880566108e-06, "loss": 0.4083, "step": 2742 }, { "epoch": 2.5563839701770736, "grad_norm": 0.1446340880585919, "learning_rate": 3.286158094580601e-06, "loss": 0.4667, "step": 2743 }, { "epoch": 2.5573159366262814, "grad_norm": 0.14563381042335108, "learning_rate": 3.2792544011045914e-06, "loss": 0.4441, "step": 2744 }, { "epoch": 2.558247903075489, "grad_norm": 0.13709845273465998, "learning_rate": 3.2723507076285817e-06, "loss": 0.4426, "step": 2745 }, { "epoch": 2.559179869524697, "grad_norm": 0.14132790010544458, "learning_rate": 3.265447014152572e-06, "loss": 0.4349, "step": 2746 }, { "epoch": 2.560111835973905, "grad_norm": 0.1364652485024354, "learning_rate": 3.258543320676562e-06, "loss": 0.4297, "step": 2747 }, { "epoch": 2.5610438024231126, "grad_norm": 0.14383848020149104, "learning_rate": 3.251639627200552e-06, "loss": 0.4583, "step": 2748 }, { "epoch": 2.561975768872321, "grad_norm": 0.14106221816831485, "learning_rate": 3.2447359337245425e-06, "loss": 0.4471, "step": 2749 }, { "epoch": 2.562907735321528, "grad_norm": 0.13816128764722443, "learning_rate": 3.2378322402485337e-06, "loss": 0.4424, "step": 2750 }, { "epoch": 2.5638397017707364, "grad_norm": 0.13286649235574438, "learning_rate": 3.230928546772524e-06, "loss": 0.4254, "step": 2751 }, { "epoch": 2.564771668219944, "grad_norm": 0.14780149582449129, "learning_rate": 3.224024853296514e-06, "loss": 0.4424, "step": 2752 }, { "epoch": 2.565703634669152, "grad_norm": 0.14066249569576356, "learning_rate": 3.217121159820504e-06, "loss": 0.4557, "step": 2753 }, { "epoch": 2.5666356011183598, "grad_norm": 0.1412784540449233, "learning_rate": 3.2102174663444945e-06, "loss": 0.4529, "step": 2754 }, { "epoch": 2.5675675675675675, "grad_norm": 0.13840622606151898, "learning_rate": 3.2033137728684848e-06, "loss": 0.4323, "step": 2755 }, { "epoch": 2.5684995340167753, "grad_norm": 0.13911046351526568, "learning_rate": 3.196410079392475e-06, "loss": 0.444, "step": 2756 }, { "epoch": 2.569431500465983, "grad_norm": 0.1468758833745106, "learning_rate": 3.1895063859164654e-06, "loss": 0.4334, "step": 2757 }, { "epoch": 2.570363466915191, "grad_norm": 0.14007084762285993, "learning_rate": 3.1826026924404557e-06, "loss": 0.4287, "step": 2758 }, { "epoch": 2.5712954333643987, "grad_norm": 0.14265838545822787, "learning_rate": 3.175698998964446e-06, "loss": 0.4431, "step": 2759 }, { "epoch": 2.572227399813607, "grad_norm": 0.14192284472012817, "learning_rate": 3.1687953054884367e-06, "loss": 0.4813, "step": 2760 }, { "epoch": 2.5731593662628143, "grad_norm": 0.14817271185982545, "learning_rate": 3.161891612012427e-06, "loss": 0.4542, "step": 2761 }, { "epoch": 2.5740913327120225, "grad_norm": 0.1408948150104707, "learning_rate": 3.1549879185364174e-06, "loss": 0.436, "step": 2762 }, { "epoch": 2.5750232991612303, "grad_norm": 0.13641404907233556, "learning_rate": 3.1480842250604077e-06, "loss": 0.4586, "step": 2763 }, { "epoch": 2.575955265610438, "grad_norm": 0.13586019559103316, "learning_rate": 3.141180531584398e-06, "loss": 0.4416, "step": 2764 }, { "epoch": 2.576887232059646, "grad_norm": 0.13836797322479039, "learning_rate": 3.1342768381083883e-06, "loss": 0.445, "step": 2765 }, { "epoch": 2.5778191985088537, "grad_norm": 0.1361508739957106, "learning_rate": 3.1273731446323786e-06, "loss": 0.4516, "step": 2766 }, { "epoch": 2.5787511649580614, "grad_norm": 0.13332913389791584, "learning_rate": 3.120469451156369e-06, "loss": 0.4125, "step": 2767 }, { "epoch": 2.5796831314072692, "grad_norm": 0.14971021386950797, "learning_rate": 3.1135657576803592e-06, "loss": 0.4348, "step": 2768 }, { "epoch": 2.580615097856477, "grad_norm": 0.14292778324349015, "learning_rate": 3.1066620642043495e-06, "loss": 0.4644, "step": 2769 }, { "epoch": 2.581547064305685, "grad_norm": 0.12533923927739266, "learning_rate": 3.0997583707283403e-06, "loss": 0.392, "step": 2770 }, { "epoch": 2.582479030754893, "grad_norm": 0.13923641246594698, "learning_rate": 3.0928546772523306e-06, "loss": 0.4519, "step": 2771 }, { "epoch": 2.5834109972041004, "grad_norm": 0.13806671213527513, "learning_rate": 3.085950983776321e-06, "loss": 0.4187, "step": 2772 }, { "epoch": 2.5843429636533086, "grad_norm": 0.13674705691854006, "learning_rate": 3.079047290300311e-06, "loss": 0.4462, "step": 2773 }, { "epoch": 2.5852749301025164, "grad_norm": 0.13988925287254098, "learning_rate": 3.0721435968243015e-06, "loss": 0.4262, "step": 2774 }, { "epoch": 2.586206896551724, "grad_norm": 0.13736710102971086, "learning_rate": 3.0652399033482914e-06, "loss": 0.4401, "step": 2775 }, { "epoch": 2.587138863000932, "grad_norm": 0.13120786203508678, "learning_rate": 3.0583362098722817e-06, "loss": 0.4255, "step": 2776 }, { "epoch": 2.5880708294501398, "grad_norm": 0.1344095745267228, "learning_rate": 3.051432516396272e-06, "loss": 0.4372, "step": 2777 }, { "epoch": 2.5890027958993476, "grad_norm": 0.13625771116558313, "learning_rate": 3.0445288229202623e-06, "loss": 0.4448, "step": 2778 }, { "epoch": 2.5899347623485554, "grad_norm": 0.1359375026593006, "learning_rate": 3.0376251294442526e-06, "loss": 0.4585, "step": 2779 }, { "epoch": 2.590866728797763, "grad_norm": 0.1358224656944687, "learning_rate": 3.0307214359682433e-06, "loss": 0.447, "step": 2780 }, { "epoch": 2.591798695246971, "grad_norm": 0.13701876162867913, "learning_rate": 3.0238177424922337e-06, "loss": 0.4426, "step": 2781 }, { "epoch": 2.592730661696179, "grad_norm": 0.13722839277885768, "learning_rate": 3.016914049016224e-06, "loss": 0.4545, "step": 2782 }, { "epoch": 2.5936626281453865, "grad_norm": 0.1331430485850252, "learning_rate": 3.0100103555402143e-06, "loss": 0.4512, "step": 2783 }, { "epoch": 2.5945945945945947, "grad_norm": 0.1360171145396635, "learning_rate": 3.0031066620642046e-06, "loss": 0.4182, "step": 2784 }, { "epoch": 2.5955265610438025, "grad_norm": 0.12969762913995464, "learning_rate": 2.996202968588195e-06, "loss": 0.4311, "step": 2785 }, { "epoch": 2.5964585274930103, "grad_norm": 0.13956874821955947, "learning_rate": 2.989299275112185e-06, "loss": 0.4743, "step": 2786 }, { "epoch": 2.597390493942218, "grad_norm": 0.14285022147284482, "learning_rate": 2.9823955816361755e-06, "loss": 0.4463, "step": 2787 }, { "epoch": 2.598322460391426, "grad_norm": 0.13870501957022477, "learning_rate": 2.975491888160166e-06, "loss": 0.4707, "step": 2788 }, { "epoch": 2.5992544268406337, "grad_norm": 0.13401311393850604, "learning_rate": 2.968588194684156e-06, "loss": 0.443, "step": 2789 }, { "epoch": 2.6001863932898415, "grad_norm": 0.13058735679277933, "learning_rate": 2.9616845012081464e-06, "loss": 0.418, "step": 2790 }, { "epoch": 2.6011183597390493, "grad_norm": 0.14222121665745105, "learning_rate": 2.954780807732137e-06, "loss": 0.4729, "step": 2791 }, { "epoch": 2.602050326188257, "grad_norm": 0.1365997880408623, "learning_rate": 2.9478771142561275e-06, "loss": 0.4505, "step": 2792 }, { "epoch": 2.6029822926374653, "grad_norm": 0.13733038561048888, "learning_rate": 2.940973420780118e-06, "loss": 0.4428, "step": 2793 }, { "epoch": 2.6039142590866726, "grad_norm": 0.136859558737128, "learning_rate": 2.934069727304108e-06, "loss": 0.4462, "step": 2794 }, { "epoch": 2.604846225535881, "grad_norm": 0.14671728584745905, "learning_rate": 2.9271660338280984e-06, "loss": 0.4509, "step": 2795 }, { "epoch": 2.6057781919850886, "grad_norm": 0.13952260536135572, "learning_rate": 2.9202623403520887e-06, "loss": 0.4183, "step": 2796 }, { "epoch": 2.6067101584342964, "grad_norm": 0.13287179300414864, "learning_rate": 2.913358646876079e-06, "loss": 0.4313, "step": 2797 }, { "epoch": 2.607642124883504, "grad_norm": 0.13787628229815282, "learning_rate": 2.906454953400069e-06, "loss": 0.4423, "step": 2798 }, { "epoch": 2.608574091332712, "grad_norm": 0.1367692467200814, "learning_rate": 2.8995512599240592e-06, "loss": 0.4529, "step": 2799 }, { "epoch": 2.60950605778192, "grad_norm": 0.13602569756739863, "learning_rate": 2.8926475664480495e-06, "loss": 0.4479, "step": 2800 }, { "epoch": 2.6104380242311276, "grad_norm": 0.14693552673326976, "learning_rate": 2.8857438729720407e-06, "loss": 0.4452, "step": 2801 }, { "epoch": 2.6113699906803354, "grad_norm": 0.1376379945140565, "learning_rate": 2.878840179496031e-06, "loss": 0.451, "step": 2802 }, { "epoch": 2.612301957129543, "grad_norm": 0.13703919888592514, "learning_rate": 2.871936486020021e-06, "loss": 0.4532, "step": 2803 }, { "epoch": 2.6132339235787514, "grad_norm": 0.13842062823806406, "learning_rate": 2.865032792544011e-06, "loss": 0.4468, "step": 2804 }, { "epoch": 2.6141658900279587, "grad_norm": 0.13098039136828074, "learning_rate": 2.8581290990680015e-06, "loss": 0.4344, "step": 2805 }, { "epoch": 2.615097856477167, "grad_norm": 0.13823364064991311, "learning_rate": 2.851225405591992e-06, "loss": 0.4474, "step": 2806 }, { "epoch": 2.6160298229263748, "grad_norm": 0.1301043383846958, "learning_rate": 2.844321712115982e-06, "loss": 0.4224, "step": 2807 }, { "epoch": 2.6169617893755825, "grad_norm": 0.1360565078816951, "learning_rate": 2.8374180186399724e-06, "loss": 0.435, "step": 2808 }, { "epoch": 2.6178937558247903, "grad_norm": 0.14568471185459142, "learning_rate": 2.8305143251639627e-06, "loss": 0.4593, "step": 2809 }, { "epoch": 2.618825722273998, "grad_norm": 0.13410690078163345, "learning_rate": 2.823610631687953e-06, "loss": 0.4204, "step": 2810 }, { "epoch": 2.619757688723206, "grad_norm": 0.13538395370414774, "learning_rate": 2.8167069382119438e-06, "loss": 0.4466, "step": 2811 }, { "epoch": 2.6206896551724137, "grad_norm": 0.14298419988278419, "learning_rate": 2.809803244735934e-06, "loss": 0.4469, "step": 2812 }, { "epoch": 2.6216216216216215, "grad_norm": 0.1353046394514826, "learning_rate": 2.8028995512599244e-06, "loss": 0.4557, "step": 2813 }, { "epoch": 2.6225535880708293, "grad_norm": 0.14586410275224362, "learning_rate": 2.7959958577839147e-06, "loss": 0.4539, "step": 2814 }, { "epoch": 2.6234855545200375, "grad_norm": 0.13472310501980173, "learning_rate": 2.789092164307905e-06, "loss": 0.4133, "step": 2815 }, { "epoch": 2.624417520969245, "grad_norm": 0.13999159463210908, "learning_rate": 2.7821884708318953e-06, "loss": 0.4525, "step": 2816 }, { "epoch": 2.625349487418453, "grad_norm": 0.1317332235438797, "learning_rate": 2.7752847773558856e-06, "loss": 0.4421, "step": 2817 }, { "epoch": 2.626281453867661, "grad_norm": 0.14596467025522644, "learning_rate": 2.768381083879876e-06, "loss": 0.4724, "step": 2818 }, { "epoch": 2.6272134203168687, "grad_norm": 0.14902102215711172, "learning_rate": 2.7614773904038662e-06, "loss": 0.4581, "step": 2819 }, { "epoch": 2.6281453867660765, "grad_norm": 0.13889912758744005, "learning_rate": 2.7545736969278566e-06, "loss": 0.4552, "step": 2820 }, { "epoch": 2.6290773532152842, "grad_norm": 0.13874559025023625, "learning_rate": 2.7476700034518473e-06, "loss": 0.4648, "step": 2821 }, { "epoch": 2.630009319664492, "grad_norm": 0.13878117845309548, "learning_rate": 2.7407663099758376e-06, "loss": 0.4526, "step": 2822 }, { "epoch": 2.6309412861137, "grad_norm": 0.13203524589866641, "learning_rate": 2.733862616499828e-06, "loss": 0.4373, "step": 2823 }, { "epoch": 2.6318732525629076, "grad_norm": 0.14080741165968752, "learning_rate": 2.7269589230238182e-06, "loss": 0.4397, "step": 2824 }, { "epoch": 2.6328052190121154, "grad_norm": 0.13789650690662064, "learning_rate": 2.7200552295478085e-06, "loss": 0.4391, "step": 2825 }, { "epoch": 2.6337371854613236, "grad_norm": 0.13188915571854448, "learning_rate": 2.7131515360717984e-06, "loss": 0.44, "step": 2826 }, { "epoch": 2.634669151910531, "grad_norm": 0.13642955306243665, "learning_rate": 2.7062478425957887e-06, "loss": 0.4667, "step": 2827 }, { "epoch": 2.635601118359739, "grad_norm": 0.1382090845870011, "learning_rate": 2.699344149119779e-06, "loss": 0.4211, "step": 2828 }, { "epoch": 2.636533084808947, "grad_norm": 0.13822757229961252, "learning_rate": 2.6924404556437693e-06, "loss": 0.4524, "step": 2829 }, { "epoch": 2.637465051258155, "grad_norm": 0.13305719283732864, "learning_rate": 2.6855367621677596e-06, "loss": 0.429, "step": 2830 }, { "epoch": 2.6383970177073626, "grad_norm": 0.13797568712028924, "learning_rate": 2.6786330686917504e-06, "loss": 0.4426, "step": 2831 }, { "epoch": 2.6393289841565704, "grad_norm": 0.13633363782946684, "learning_rate": 2.6717293752157407e-06, "loss": 0.4312, "step": 2832 }, { "epoch": 2.640260950605778, "grad_norm": 0.1411655300708518, "learning_rate": 2.664825681739731e-06, "loss": 0.4435, "step": 2833 }, { "epoch": 2.641192917054986, "grad_norm": 0.1332232404667683, "learning_rate": 2.6579219882637213e-06, "loss": 0.4391, "step": 2834 }, { "epoch": 2.6421248835041937, "grad_norm": 0.14287913260918503, "learning_rate": 2.6510182947877116e-06, "loss": 0.4857, "step": 2835 }, { "epoch": 2.6430568499534015, "grad_norm": 0.13169933285444677, "learning_rate": 2.644114601311702e-06, "loss": 0.4217, "step": 2836 }, { "epoch": 2.6439888164026097, "grad_norm": 0.1409799857412641, "learning_rate": 2.6372109078356922e-06, "loss": 0.4552, "step": 2837 }, { "epoch": 2.644920782851817, "grad_norm": 0.14776123000382416, "learning_rate": 2.6303072143596825e-06, "loss": 0.4723, "step": 2838 }, { "epoch": 2.6458527493010253, "grad_norm": 0.13630561177700923, "learning_rate": 2.623403520883673e-06, "loss": 0.4408, "step": 2839 }, { "epoch": 2.646784715750233, "grad_norm": 0.13565369872405902, "learning_rate": 2.616499827407663e-06, "loss": 0.4347, "step": 2840 }, { "epoch": 2.647716682199441, "grad_norm": 0.1344102615161753, "learning_rate": 2.609596133931654e-06, "loss": 0.4209, "step": 2841 }, { "epoch": 2.6486486486486487, "grad_norm": 0.13359023005259024, "learning_rate": 2.602692440455644e-06, "loss": 0.4422, "step": 2842 }, { "epoch": 2.6495806150978565, "grad_norm": 0.14160172078122166, "learning_rate": 2.5957887469796345e-06, "loss": 0.4485, "step": 2843 }, { "epoch": 2.6505125815470643, "grad_norm": 0.13809784386616483, "learning_rate": 2.588885053503625e-06, "loss": 0.4455, "step": 2844 }, { "epoch": 2.651444547996272, "grad_norm": 0.137216532610375, "learning_rate": 2.581981360027615e-06, "loss": 0.4436, "step": 2845 }, { "epoch": 2.65237651444548, "grad_norm": 0.13666341896217069, "learning_rate": 2.5750776665516054e-06, "loss": 0.4279, "step": 2846 }, { "epoch": 2.6533084808946876, "grad_norm": 0.13836574660223916, "learning_rate": 2.5681739730755957e-06, "loss": 0.4642, "step": 2847 }, { "epoch": 2.654240447343896, "grad_norm": 0.13265113082341234, "learning_rate": 2.561270279599586e-06, "loss": 0.4384, "step": 2848 }, { "epoch": 2.655172413793103, "grad_norm": 0.1327159136358331, "learning_rate": 2.554366586123576e-06, "loss": 0.4391, "step": 2849 }, { "epoch": 2.6561043802423114, "grad_norm": 0.1435928127772928, "learning_rate": 2.5474628926475662e-06, "loss": 0.4783, "step": 2850 }, { "epoch": 2.6570363466915192, "grad_norm": 0.13763661020529114, "learning_rate": 2.5405591991715574e-06, "loss": 0.4555, "step": 2851 }, { "epoch": 2.657968313140727, "grad_norm": 0.13518447351121307, "learning_rate": 2.5336555056955477e-06, "loss": 0.4363, "step": 2852 }, { "epoch": 2.658900279589935, "grad_norm": 0.13763397309465245, "learning_rate": 2.5267518122195376e-06, "loss": 0.4483, "step": 2853 }, { "epoch": 2.6598322460391426, "grad_norm": 0.13111045915322558, "learning_rate": 2.519848118743528e-06, "loss": 0.4406, "step": 2854 }, { "epoch": 2.6607642124883504, "grad_norm": 0.1364757780728356, "learning_rate": 2.5129444252675182e-06, "loss": 0.4553, "step": 2855 }, { "epoch": 2.661696178937558, "grad_norm": 0.13575878328824295, "learning_rate": 2.5060407317915085e-06, "loss": 0.4484, "step": 2856 }, { "epoch": 2.662628145386766, "grad_norm": 0.14393453652320143, "learning_rate": 2.499137038315499e-06, "loss": 0.4691, "step": 2857 }, { "epoch": 2.6635601118359737, "grad_norm": 0.1308129244220445, "learning_rate": 2.492233344839489e-06, "loss": 0.4377, "step": 2858 }, { "epoch": 2.664492078285182, "grad_norm": 0.13563027472569053, "learning_rate": 2.48532965136348e-06, "loss": 0.4352, "step": 2859 }, { "epoch": 2.6654240447343893, "grad_norm": 0.15357961769002593, "learning_rate": 2.47842595788747e-06, "loss": 0.4803, "step": 2860 }, { "epoch": 2.6663560111835976, "grad_norm": 0.1322522901627914, "learning_rate": 2.4715222644114605e-06, "loss": 0.4438, "step": 2861 }, { "epoch": 2.6672879776328053, "grad_norm": 0.14548984890164035, "learning_rate": 2.464618570935451e-06, "loss": 0.4948, "step": 2862 }, { "epoch": 2.668219944082013, "grad_norm": 0.13126060557168354, "learning_rate": 2.4577148774594407e-06, "loss": 0.4323, "step": 2863 }, { "epoch": 2.669151910531221, "grad_norm": 0.14150184083616812, "learning_rate": 2.4508111839834314e-06, "loss": 0.4324, "step": 2864 }, { "epoch": 2.6700838769804287, "grad_norm": 0.13424633057194796, "learning_rate": 2.4439074905074217e-06, "loss": 0.4445, "step": 2865 }, { "epoch": 2.6710158434296365, "grad_norm": 0.1459166212391059, "learning_rate": 2.437003797031412e-06, "loss": 0.4435, "step": 2866 }, { "epoch": 2.6719478098788443, "grad_norm": 0.14187027557309703, "learning_rate": 2.4301001035554023e-06, "loss": 0.4529, "step": 2867 }, { "epoch": 2.672879776328052, "grad_norm": 0.12904652478613426, "learning_rate": 2.4231964100793927e-06, "loss": 0.4344, "step": 2868 }, { "epoch": 2.67381174277726, "grad_norm": 0.13273381080199015, "learning_rate": 2.416292716603383e-06, "loss": 0.4213, "step": 2869 }, { "epoch": 2.674743709226468, "grad_norm": 0.13885025959615535, "learning_rate": 2.4093890231273733e-06, "loss": 0.4309, "step": 2870 }, { "epoch": 2.6756756756756754, "grad_norm": 0.1433879549790054, "learning_rate": 2.4024853296513636e-06, "loss": 0.4735, "step": 2871 }, { "epoch": 2.6766076421248837, "grad_norm": 0.14003549727028886, "learning_rate": 2.395581636175354e-06, "loss": 0.4588, "step": 2872 }, { "epoch": 2.6775396085740915, "grad_norm": 0.13812285641321786, "learning_rate": 2.388677942699344e-06, "loss": 0.4416, "step": 2873 }, { "epoch": 2.6784715750232992, "grad_norm": 0.1453195386486805, "learning_rate": 2.381774249223335e-06, "loss": 0.4639, "step": 2874 }, { "epoch": 2.679403541472507, "grad_norm": 0.14369282027328367, "learning_rate": 2.3748705557473252e-06, "loss": 0.4857, "step": 2875 }, { "epoch": 2.680335507921715, "grad_norm": 0.14393499813534666, "learning_rate": 2.367966862271315e-06, "loss": 0.4676, "step": 2876 }, { "epoch": 2.6812674743709226, "grad_norm": 0.14019913692234529, "learning_rate": 2.3610631687953054e-06, "loss": 0.4497, "step": 2877 }, { "epoch": 2.6821994408201304, "grad_norm": 0.14289937346851558, "learning_rate": 2.3541594753192957e-06, "loss": 0.4693, "step": 2878 }, { "epoch": 2.683131407269338, "grad_norm": 0.13936284809690327, "learning_rate": 2.3472557818432865e-06, "loss": 0.4311, "step": 2879 }, { "epoch": 2.684063373718546, "grad_norm": 0.1327253537478337, "learning_rate": 2.3403520883672768e-06, "loss": 0.4448, "step": 2880 }, { "epoch": 2.684995340167754, "grad_norm": 0.1365622312344743, "learning_rate": 2.333448394891267e-06, "loss": 0.4193, "step": 2881 }, { "epoch": 2.6859273066169616, "grad_norm": 0.13760645472850347, "learning_rate": 2.3265447014152574e-06, "loss": 0.4293, "step": 2882 }, { "epoch": 2.68685927306617, "grad_norm": 0.13082288851202406, "learning_rate": 2.3196410079392477e-06, "loss": 0.4337, "step": 2883 }, { "epoch": 2.6877912395153776, "grad_norm": 0.141022314082249, "learning_rate": 2.312737314463238e-06, "loss": 0.4553, "step": 2884 }, { "epoch": 2.6887232059645854, "grad_norm": 0.1456466464857144, "learning_rate": 2.3058336209872283e-06, "loss": 0.4666, "step": 2885 }, { "epoch": 2.689655172413793, "grad_norm": 0.13770858071477687, "learning_rate": 2.2989299275112186e-06, "loss": 0.4596, "step": 2886 }, { "epoch": 2.690587138863001, "grad_norm": 0.14653494706419043, "learning_rate": 2.292026234035209e-06, "loss": 0.4499, "step": 2887 }, { "epoch": 2.6915191053122087, "grad_norm": 0.13891409639028576, "learning_rate": 2.2851225405591993e-06, "loss": 0.4454, "step": 2888 }, { "epoch": 2.6924510717614165, "grad_norm": 0.1276377513725637, "learning_rate": 2.27821884708319e-06, "loss": 0.4401, "step": 2889 }, { "epoch": 2.6933830382106243, "grad_norm": 0.13165157296389754, "learning_rate": 2.27131515360718e-06, "loss": 0.4386, "step": 2890 }, { "epoch": 2.694315004659832, "grad_norm": 0.13850151373339675, "learning_rate": 2.26441146013117e-06, "loss": 0.4565, "step": 2891 }, { "epoch": 2.6952469711090403, "grad_norm": 0.1447648683442491, "learning_rate": 2.2575077666551605e-06, "loss": 0.4333, "step": 2892 }, { "epoch": 2.6961789375582477, "grad_norm": 0.14654653538030296, "learning_rate": 2.250604073179151e-06, "loss": 0.4563, "step": 2893 }, { "epoch": 2.697110904007456, "grad_norm": 0.1359299954957155, "learning_rate": 2.2437003797031415e-06, "loss": 0.449, "step": 2894 }, { "epoch": 2.6980428704566637, "grad_norm": 0.13847975273704569, "learning_rate": 2.236796686227132e-06, "loss": 0.4471, "step": 2895 }, { "epoch": 2.6989748369058715, "grad_norm": 0.13093604453320007, "learning_rate": 2.229892992751122e-06, "loss": 0.4202, "step": 2896 }, { "epoch": 2.6999068033550793, "grad_norm": 0.12996100046320108, "learning_rate": 2.2229892992751125e-06, "loss": 0.4249, "step": 2897 }, { "epoch": 2.700838769804287, "grad_norm": 0.13807454631211927, "learning_rate": 2.2160856057991028e-06, "loss": 0.4437, "step": 2898 }, { "epoch": 2.701770736253495, "grad_norm": 0.14208724605467535, "learning_rate": 2.209181912323093e-06, "loss": 0.4325, "step": 2899 }, { "epoch": 2.7027027027027026, "grad_norm": 0.13932848292476527, "learning_rate": 2.2022782188470834e-06, "loss": 0.4548, "step": 2900 }, { "epoch": 2.7036346691519104, "grad_norm": 0.1323336158520391, "learning_rate": 2.1953745253710737e-06, "loss": 0.4277, "step": 2901 }, { "epoch": 2.704566635601118, "grad_norm": 0.1328528858890006, "learning_rate": 2.188470831895064e-06, "loss": 0.4235, "step": 2902 }, { "epoch": 2.7054986020503264, "grad_norm": 0.1378499811004366, "learning_rate": 2.1815671384190543e-06, "loss": 0.4422, "step": 2903 }, { "epoch": 2.706430568499534, "grad_norm": 0.14082529574327576, "learning_rate": 2.1746634449430446e-06, "loss": 0.451, "step": 2904 }, { "epoch": 2.707362534948742, "grad_norm": 0.13491474864164446, "learning_rate": 2.167759751467035e-06, "loss": 0.4501, "step": 2905 }, { "epoch": 2.70829450139795, "grad_norm": 0.14166207719154317, "learning_rate": 2.1608560579910252e-06, "loss": 0.4886, "step": 2906 }, { "epoch": 2.7092264678471576, "grad_norm": 0.13546226595218958, "learning_rate": 2.1539523645150156e-06, "loss": 0.4744, "step": 2907 }, { "epoch": 2.7101584342963654, "grad_norm": 0.13205410225716152, "learning_rate": 2.147048671039006e-06, "loss": 0.4286, "step": 2908 }, { "epoch": 2.711090400745573, "grad_norm": 0.1407516255176865, "learning_rate": 2.1401449775629966e-06, "loss": 0.4353, "step": 2909 }, { "epoch": 2.712022367194781, "grad_norm": 0.1450324570671356, "learning_rate": 2.133241284086987e-06, "loss": 0.4665, "step": 2910 }, { "epoch": 2.7129543336439887, "grad_norm": 0.13698991857653026, "learning_rate": 2.1263375906109772e-06, "loss": 0.44, "step": 2911 }, { "epoch": 2.7138863000931965, "grad_norm": 0.13680362176162522, "learning_rate": 2.1194338971349675e-06, "loss": 0.4465, "step": 2912 }, { "epoch": 2.7148182665424043, "grad_norm": 0.1511183029462366, "learning_rate": 2.1125302036589574e-06, "loss": 0.4859, "step": 2913 }, { "epoch": 2.7157502329916126, "grad_norm": 0.13752984757717468, "learning_rate": 2.105626510182948e-06, "loss": 0.4184, "step": 2914 }, { "epoch": 2.71668219944082, "grad_norm": 0.14734959605241665, "learning_rate": 2.0987228167069384e-06, "loss": 0.4356, "step": 2915 }, { "epoch": 2.717614165890028, "grad_norm": 0.13535994637368298, "learning_rate": 2.0918191232309288e-06, "loss": 0.4404, "step": 2916 }, { "epoch": 2.718546132339236, "grad_norm": 0.13272696122564573, "learning_rate": 2.084915429754919e-06, "loss": 0.4185, "step": 2917 }, { "epoch": 2.7194780987884437, "grad_norm": 0.13190199190652832, "learning_rate": 2.0780117362789094e-06, "loss": 0.4406, "step": 2918 }, { "epoch": 2.7204100652376515, "grad_norm": 0.14404490602847933, "learning_rate": 2.0711080428028997e-06, "loss": 0.4636, "step": 2919 }, { "epoch": 2.7213420316868593, "grad_norm": 0.13570212871773243, "learning_rate": 2.06420434932689e-06, "loss": 0.4367, "step": 2920 }, { "epoch": 2.722273998136067, "grad_norm": 0.14498445105091476, "learning_rate": 2.0573006558508803e-06, "loss": 0.4463, "step": 2921 }, { "epoch": 2.723205964585275, "grad_norm": 0.1394339287597516, "learning_rate": 2.0503969623748706e-06, "loss": 0.4459, "step": 2922 }, { "epoch": 2.7241379310344827, "grad_norm": 0.13318250091745412, "learning_rate": 2.043493268898861e-06, "loss": 0.4143, "step": 2923 }, { "epoch": 2.7250698974836904, "grad_norm": 0.14003906544130001, "learning_rate": 2.0365895754228517e-06, "loss": 0.4443, "step": 2924 }, { "epoch": 2.7260018639328987, "grad_norm": 0.13345893525874544, "learning_rate": 2.029685881946842e-06, "loss": 0.436, "step": 2925 }, { "epoch": 2.726933830382106, "grad_norm": 0.14175302926381417, "learning_rate": 2.0227821884708323e-06, "loss": 0.4558, "step": 2926 }, { "epoch": 2.7278657968313142, "grad_norm": 0.14227473202001042, "learning_rate": 2.015878494994822e-06, "loss": 0.4659, "step": 2927 }, { "epoch": 2.728797763280522, "grad_norm": 0.1875685192633546, "learning_rate": 2.0089748015188125e-06, "loss": 0.4274, "step": 2928 }, { "epoch": 2.72972972972973, "grad_norm": 0.14053626258005128, "learning_rate": 2.002071108042803e-06, "loss": 0.4563, "step": 2929 }, { "epoch": 2.7306616961789376, "grad_norm": 0.1395965115608562, "learning_rate": 1.9951674145667935e-06, "loss": 0.4417, "step": 2930 }, { "epoch": 2.7315936626281454, "grad_norm": 0.1380807426588909, "learning_rate": 1.988263721090784e-06, "loss": 0.4288, "step": 2931 }, { "epoch": 2.732525629077353, "grad_norm": 0.14030995168371743, "learning_rate": 1.981360027614774e-06, "loss": 0.4476, "step": 2932 }, { "epoch": 2.733457595526561, "grad_norm": 0.13492107610808782, "learning_rate": 1.9744563341387644e-06, "loss": 0.4421, "step": 2933 }, { "epoch": 2.7343895619757688, "grad_norm": 0.14257366683435507, "learning_rate": 1.9675526406627547e-06, "loss": 0.457, "step": 2934 }, { "epoch": 2.7353215284249766, "grad_norm": 0.13234176326450312, "learning_rate": 1.960648947186745e-06, "loss": 0.4218, "step": 2935 }, { "epoch": 2.736253494874185, "grad_norm": 0.13554000120665055, "learning_rate": 1.9537452537107354e-06, "loss": 0.4456, "step": 2936 }, { "epoch": 2.737185461323392, "grad_norm": 0.1391600327012469, "learning_rate": 1.9468415602347257e-06, "loss": 0.4722, "step": 2937 }, { "epoch": 2.7381174277726004, "grad_norm": 0.1390827777855385, "learning_rate": 1.939937866758716e-06, "loss": 0.446, "step": 2938 }, { "epoch": 2.739049394221808, "grad_norm": 0.14021483900062998, "learning_rate": 1.9330341732827067e-06, "loss": 0.4386, "step": 2939 }, { "epoch": 2.739981360671016, "grad_norm": 0.14340472411345392, "learning_rate": 1.926130479806697e-06, "loss": 0.4575, "step": 2940 }, { "epoch": 2.7409133271202237, "grad_norm": 0.13789634075211368, "learning_rate": 1.919226786330687e-06, "loss": 0.4421, "step": 2941 }, { "epoch": 2.7418452935694315, "grad_norm": 0.14044846452309978, "learning_rate": 1.9123230928546772e-06, "loss": 0.4676, "step": 2942 }, { "epoch": 2.7427772600186393, "grad_norm": 0.1312476831148715, "learning_rate": 1.9054193993786677e-06, "loss": 0.4266, "step": 2943 }, { "epoch": 2.743709226467847, "grad_norm": 0.13490153319151635, "learning_rate": 1.8985157059026583e-06, "loss": 0.4472, "step": 2944 }, { "epoch": 2.744641192917055, "grad_norm": 0.131176040987014, "learning_rate": 1.8916120124266486e-06, "loss": 0.4273, "step": 2945 }, { "epoch": 2.7455731593662627, "grad_norm": 0.13651476924912914, "learning_rate": 1.8847083189506389e-06, "loss": 0.4618, "step": 2946 }, { "epoch": 2.746505125815471, "grad_norm": 0.1342458774360753, "learning_rate": 1.877804625474629e-06, "loss": 0.4423, "step": 2947 }, { "epoch": 2.7474370922646782, "grad_norm": 0.13544771801723351, "learning_rate": 1.8709009319986193e-06, "loss": 0.4409, "step": 2948 }, { "epoch": 2.7483690587138865, "grad_norm": 0.13738011965857128, "learning_rate": 1.8639972385226098e-06, "loss": 0.4602, "step": 2949 }, { "epoch": 2.7493010251630943, "grad_norm": 0.13582793483513053, "learning_rate": 1.8570935450466001e-06, "loss": 0.4624, "step": 2950 }, { "epoch": 2.750232991612302, "grad_norm": 0.13081182073584174, "learning_rate": 1.8501898515705904e-06, "loss": 0.4272, "step": 2951 }, { "epoch": 2.75116495806151, "grad_norm": 0.13321386493418377, "learning_rate": 1.8432861580945807e-06, "loss": 0.4191, "step": 2952 }, { "epoch": 2.7520969245107176, "grad_norm": 0.1425219664904654, "learning_rate": 1.836382464618571e-06, "loss": 0.4741, "step": 2953 }, { "epoch": 2.7530288909599254, "grad_norm": 0.1321290174281227, "learning_rate": 1.8294787711425613e-06, "loss": 0.4365, "step": 2954 }, { "epoch": 2.753960857409133, "grad_norm": 0.13164923826715352, "learning_rate": 1.8225750776665519e-06, "loss": 0.4206, "step": 2955 }, { "epoch": 2.754892823858341, "grad_norm": 0.13960846333971336, "learning_rate": 1.8156713841905422e-06, "loss": 0.4491, "step": 2956 }, { "epoch": 2.755824790307549, "grad_norm": 0.13864211085013686, "learning_rate": 1.8087676907145325e-06, "loss": 0.4337, "step": 2957 }, { "epoch": 2.756756756756757, "grad_norm": 0.136225431273349, "learning_rate": 1.8018639972385226e-06, "loss": 0.4436, "step": 2958 }, { "epoch": 2.7576887232059644, "grad_norm": 0.13142525737264663, "learning_rate": 1.7949603037625129e-06, "loss": 0.4457, "step": 2959 }, { "epoch": 2.7586206896551726, "grad_norm": 0.13501432313733677, "learning_rate": 1.7880566102865034e-06, "loss": 0.4167, "step": 2960 }, { "epoch": 2.7595526561043804, "grad_norm": 0.13875712594406245, "learning_rate": 1.7811529168104937e-06, "loss": 0.4652, "step": 2961 }, { "epoch": 2.760484622553588, "grad_norm": 0.14596241943107896, "learning_rate": 1.774249223334484e-06, "loss": 0.4584, "step": 2962 }, { "epoch": 2.761416589002796, "grad_norm": 0.13835846955999564, "learning_rate": 1.7673455298584743e-06, "loss": 0.4368, "step": 2963 }, { "epoch": 2.7623485554520038, "grad_norm": 0.13510775391744445, "learning_rate": 1.7604418363824646e-06, "loss": 0.4186, "step": 2964 }, { "epoch": 2.7632805219012115, "grad_norm": 0.1401370820708152, "learning_rate": 1.7535381429064552e-06, "loss": 0.4334, "step": 2965 }, { "epoch": 2.7642124883504193, "grad_norm": 0.13751021052142695, "learning_rate": 1.7466344494304455e-06, "loss": 0.4637, "step": 2966 }, { "epoch": 2.765144454799627, "grad_norm": 0.13315111252014455, "learning_rate": 1.7397307559544358e-06, "loss": 0.4181, "step": 2967 }, { "epoch": 2.766076421248835, "grad_norm": 0.1467250297043431, "learning_rate": 1.732827062478426e-06, "loss": 0.457, "step": 2968 }, { "epoch": 2.767008387698043, "grad_norm": 0.14104457595779538, "learning_rate": 1.7259233690024164e-06, "loss": 0.4623, "step": 2969 }, { "epoch": 2.7679403541472505, "grad_norm": 0.134418155534878, "learning_rate": 1.719019675526407e-06, "loss": 0.4453, "step": 2970 }, { "epoch": 2.7688723205964587, "grad_norm": 0.13667303355920038, "learning_rate": 1.7121159820503972e-06, "loss": 0.4448, "step": 2971 }, { "epoch": 2.7698042870456665, "grad_norm": 0.13392739392088931, "learning_rate": 1.7052122885743873e-06, "loss": 0.4436, "step": 2972 }, { "epoch": 2.7707362534948743, "grad_norm": 0.13967890579852146, "learning_rate": 1.6983085950983776e-06, "loss": 0.4411, "step": 2973 }, { "epoch": 2.771668219944082, "grad_norm": 0.14012663445136758, "learning_rate": 1.691404901622368e-06, "loss": 0.4574, "step": 2974 }, { "epoch": 2.77260018639329, "grad_norm": 0.13871856273251382, "learning_rate": 1.6845012081463585e-06, "loss": 0.445, "step": 2975 }, { "epoch": 2.7735321528424977, "grad_norm": 0.1372357525173242, "learning_rate": 1.6775975146703488e-06, "loss": 0.4545, "step": 2976 }, { "epoch": 2.7744641192917054, "grad_norm": 0.13224665003647468, "learning_rate": 1.670693821194339e-06, "loss": 0.4398, "step": 2977 }, { "epoch": 2.7753960857409132, "grad_norm": 0.1321648100195117, "learning_rate": 1.6637901277183294e-06, "loss": 0.4244, "step": 2978 }, { "epoch": 2.776328052190121, "grad_norm": 0.13122270592527555, "learning_rate": 1.6568864342423197e-06, "loss": 0.442, "step": 2979 }, { "epoch": 2.7772600186393293, "grad_norm": 0.13638162212713892, "learning_rate": 1.6499827407663102e-06, "loss": 0.4328, "step": 2980 }, { "epoch": 2.7781919850885366, "grad_norm": 0.13642970745343566, "learning_rate": 1.6430790472903005e-06, "loss": 0.43, "step": 2981 }, { "epoch": 2.779123951537745, "grad_norm": 0.14406426207538903, "learning_rate": 1.6361753538142908e-06, "loss": 0.4897, "step": 2982 }, { "epoch": 2.7800559179869526, "grad_norm": 0.1286392236529146, "learning_rate": 1.629271660338281e-06, "loss": 0.4308, "step": 2983 }, { "epoch": 2.7809878844361604, "grad_norm": 0.14003540251607544, "learning_rate": 1.6223679668622713e-06, "loss": 0.4617, "step": 2984 }, { "epoch": 2.781919850885368, "grad_norm": 0.13169481742818126, "learning_rate": 1.615464273386262e-06, "loss": 0.4448, "step": 2985 }, { "epoch": 2.782851817334576, "grad_norm": 0.14153430924915641, "learning_rate": 1.608560579910252e-06, "loss": 0.4598, "step": 2986 }, { "epoch": 2.7837837837837838, "grad_norm": 0.1380248369020922, "learning_rate": 1.6016568864342424e-06, "loss": 0.455, "step": 2987 }, { "epoch": 2.7847157502329916, "grad_norm": 0.12753413083942627, "learning_rate": 1.5947531929582327e-06, "loss": 0.4344, "step": 2988 }, { "epoch": 2.7856477166821993, "grad_norm": 0.2846693394332861, "learning_rate": 1.587849499482223e-06, "loss": 0.4342, "step": 2989 }, { "epoch": 2.786579683131407, "grad_norm": 0.1399242613354117, "learning_rate": 1.5809458060062135e-06, "loss": 0.4433, "step": 2990 }, { "epoch": 2.7875116495806154, "grad_norm": 0.1288933132877417, "learning_rate": 1.5740421125302038e-06, "loss": 0.4318, "step": 2991 }, { "epoch": 2.7884436160298227, "grad_norm": 0.12770396674048054, "learning_rate": 1.5671384190541941e-06, "loss": 0.4211, "step": 2992 }, { "epoch": 2.789375582479031, "grad_norm": 0.13438429296839607, "learning_rate": 1.5602347255781845e-06, "loss": 0.4551, "step": 2993 }, { "epoch": 2.7903075489282387, "grad_norm": 0.13341782816832481, "learning_rate": 1.5533310321021748e-06, "loss": 0.4554, "step": 2994 }, { "epoch": 2.7912395153774465, "grad_norm": 0.13403947990846513, "learning_rate": 1.5464273386261653e-06, "loss": 0.4325, "step": 2995 }, { "epoch": 2.7921714818266543, "grad_norm": 0.1331628010475491, "learning_rate": 1.5395236451501556e-06, "loss": 0.4481, "step": 2996 }, { "epoch": 2.793103448275862, "grad_norm": 0.1325350444427255, "learning_rate": 1.5326199516741457e-06, "loss": 0.4286, "step": 2997 }, { "epoch": 2.79403541472507, "grad_norm": 0.14120324292842618, "learning_rate": 1.525716258198136e-06, "loss": 0.4417, "step": 2998 }, { "epoch": 2.7949673811742777, "grad_norm": 0.13640792752402073, "learning_rate": 1.5188125647221263e-06, "loss": 0.4479, "step": 2999 }, { "epoch": 2.7958993476234855, "grad_norm": 0.13599184479278967, "learning_rate": 1.5119088712461168e-06, "loss": 0.4229, "step": 3000 }, { "epoch": 2.7968313140726933, "grad_norm": 0.13648480103969435, "learning_rate": 1.5050051777701071e-06, "loss": 0.4526, "step": 3001 }, { "epoch": 2.7977632805219015, "grad_norm": 0.19271487998996098, "learning_rate": 1.4981014842940974e-06, "loss": 0.4538, "step": 3002 }, { "epoch": 2.798695246971109, "grad_norm": 0.13464992150492316, "learning_rate": 1.4911977908180878e-06, "loss": 0.4223, "step": 3003 }, { "epoch": 2.799627213420317, "grad_norm": 0.14089508428270137, "learning_rate": 1.484294097342078e-06, "loss": 0.4573, "step": 3004 }, { "epoch": 2.800559179869525, "grad_norm": 0.13749235020233316, "learning_rate": 1.4773904038660686e-06, "loss": 0.456, "step": 3005 }, { "epoch": 2.8014911463187326, "grad_norm": 0.14126417348842452, "learning_rate": 1.470486710390059e-06, "loss": 0.4892, "step": 3006 }, { "epoch": 2.8024231127679404, "grad_norm": 0.1490094557980083, "learning_rate": 1.4635830169140492e-06, "loss": 0.5009, "step": 3007 }, { "epoch": 2.803355079217148, "grad_norm": 0.13465307766256823, "learning_rate": 1.4566793234380395e-06, "loss": 0.4389, "step": 3008 }, { "epoch": 2.804287045666356, "grad_norm": 0.13067020086465256, "learning_rate": 1.4497756299620296e-06, "loss": 0.4122, "step": 3009 }, { "epoch": 2.805219012115564, "grad_norm": 0.13526615178157564, "learning_rate": 1.4428719364860203e-06, "loss": 0.4369, "step": 3010 }, { "epoch": 2.8061509785647716, "grad_norm": 0.13926001771799948, "learning_rate": 1.4359682430100104e-06, "loss": 0.4252, "step": 3011 }, { "epoch": 2.8070829450139794, "grad_norm": 0.12905553281333854, "learning_rate": 1.4290645495340007e-06, "loss": 0.4256, "step": 3012 }, { "epoch": 2.8080149114631876, "grad_norm": 0.13478213549871107, "learning_rate": 1.422160856057991e-06, "loss": 0.4244, "step": 3013 }, { "epoch": 2.808946877912395, "grad_norm": 0.13814956699564174, "learning_rate": 1.4152571625819814e-06, "loss": 0.4389, "step": 3014 }, { "epoch": 2.809878844361603, "grad_norm": 0.13569626383630568, "learning_rate": 1.4083534691059719e-06, "loss": 0.4491, "step": 3015 }, { "epoch": 2.810810810810811, "grad_norm": 0.13344481787375925, "learning_rate": 1.4014497756299622e-06, "loss": 0.446, "step": 3016 }, { "epoch": 2.8117427772600188, "grad_norm": 0.12673456864836205, "learning_rate": 1.3945460821539525e-06, "loss": 0.4085, "step": 3017 }, { "epoch": 2.8126747437092265, "grad_norm": 0.13693578732214215, "learning_rate": 1.3876423886779428e-06, "loss": 0.4363, "step": 3018 }, { "epoch": 2.8136067101584343, "grad_norm": 0.1339537525266387, "learning_rate": 1.3807386952019331e-06, "loss": 0.4384, "step": 3019 }, { "epoch": 2.814538676607642, "grad_norm": 0.14529784717893934, "learning_rate": 1.3738350017259236e-06, "loss": 0.4474, "step": 3020 }, { "epoch": 2.81547064305685, "grad_norm": 0.14292840261886627, "learning_rate": 1.366931308249914e-06, "loss": 0.4602, "step": 3021 }, { "epoch": 2.8164026095060577, "grad_norm": 0.14081817840334623, "learning_rate": 1.3600276147739043e-06, "loss": 0.4564, "step": 3022 }, { "epoch": 2.8173345759552655, "grad_norm": 0.13297394106513835, "learning_rate": 1.3531239212978944e-06, "loss": 0.4342, "step": 3023 }, { "epoch": 2.8182665424044733, "grad_norm": 0.13369770449263138, "learning_rate": 1.3462202278218847e-06, "loss": 0.4511, "step": 3024 }, { "epoch": 2.819198508853681, "grad_norm": 0.13600957944170844, "learning_rate": 1.3393165343458752e-06, "loss": 0.4571, "step": 3025 }, { "epoch": 2.8201304753028893, "grad_norm": 0.13794522575668033, "learning_rate": 1.3324128408698655e-06, "loss": 0.4672, "step": 3026 }, { "epoch": 2.821062441752097, "grad_norm": 0.13382651627625045, "learning_rate": 1.3255091473938558e-06, "loss": 0.4451, "step": 3027 }, { "epoch": 2.821994408201305, "grad_norm": 0.15156624787516998, "learning_rate": 1.3186054539178461e-06, "loss": 0.4932, "step": 3028 }, { "epoch": 2.8229263746505127, "grad_norm": 0.13923992105172098, "learning_rate": 1.3117017604418364e-06, "loss": 0.4497, "step": 3029 }, { "epoch": 2.8238583410997204, "grad_norm": 0.12984244294621997, "learning_rate": 1.304798066965827e-06, "loss": 0.4274, "step": 3030 }, { "epoch": 2.8247903075489282, "grad_norm": 0.14101589432995612, "learning_rate": 1.2978943734898173e-06, "loss": 0.4536, "step": 3031 }, { "epoch": 2.825722273998136, "grad_norm": 0.1399949431371385, "learning_rate": 1.2909906800138076e-06, "loss": 0.4444, "step": 3032 }, { "epoch": 2.826654240447344, "grad_norm": 0.12997972653358017, "learning_rate": 1.2840869865377979e-06, "loss": 0.4377, "step": 3033 }, { "epoch": 2.8275862068965516, "grad_norm": 0.13106399625127377, "learning_rate": 1.277183293061788e-06, "loss": 0.4342, "step": 3034 }, { "epoch": 2.8285181733457594, "grad_norm": 0.13788481012826878, "learning_rate": 1.2702795995857787e-06, "loss": 0.4649, "step": 3035 }, { "epoch": 2.829450139794967, "grad_norm": 0.12746481989128416, "learning_rate": 1.2633759061097688e-06, "loss": 0.4199, "step": 3036 }, { "epoch": 2.8303821062441754, "grad_norm": 0.13704742144109255, "learning_rate": 1.2564722126337591e-06, "loss": 0.4488, "step": 3037 }, { "epoch": 2.831314072693383, "grad_norm": 0.13416902987934237, "learning_rate": 1.2495685191577494e-06, "loss": 0.4513, "step": 3038 }, { "epoch": 2.832246039142591, "grad_norm": 0.1350579664346582, "learning_rate": 1.24266482568174e-06, "loss": 0.4524, "step": 3039 }, { "epoch": 2.8331780055917988, "grad_norm": 0.12964557251152645, "learning_rate": 1.2357611322057302e-06, "loss": 0.4253, "step": 3040 }, { "epoch": 2.8341099720410066, "grad_norm": 0.12931072140887967, "learning_rate": 1.2288574387297203e-06, "loss": 0.4204, "step": 3041 }, { "epoch": 2.8350419384902144, "grad_norm": 0.13221695620205398, "learning_rate": 1.2219537452537109e-06, "loss": 0.4327, "step": 3042 }, { "epoch": 2.835973904939422, "grad_norm": 0.13987217713990924, "learning_rate": 1.2150500517777012e-06, "loss": 0.4485, "step": 3043 }, { "epoch": 2.83690587138863, "grad_norm": 0.13442873807897351, "learning_rate": 1.2081463583016915e-06, "loss": 0.4422, "step": 3044 }, { "epoch": 2.8378378378378377, "grad_norm": 0.13233642875739882, "learning_rate": 1.2012426648256818e-06, "loss": 0.442, "step": 3045 }, { "epoch": 2.8387698042870455, "grad_norm": 0.13313253206864542, "learning_rate": 1.194338971349672e-06, "loss": 0.4436, "step": 3046 }, { "epoch": 2.8397017707362533, "grad_norm": 0.12953953829699355, "learning_rate": 1.1874352778736626e-06, "loss": 0.4349, "step": 3047 }, { "epoch": 2.8406337371854615, "grad_norm": 0.13067472405332436, "learning_rate": 1.1805315843976527e-06, "loss": 0.4265, "step": 3048 }, { "epoch": 2.8415657036346693, "grad_norm": 0.1385130502848568, "learning_rate": 1.1736278909216432e-06, "loss": 0.4808, "step": 3049 }, { "epoch": 2.842497670083877, "grad_norm": 0.1424107247773715, "learning_rate": 1.1667241974456335e-06, "loss": 0.4459, "step": 3050 }, { "epoch": 2.843429636533085, "grad_norm": 0.13339165189507335, "learning_rate": 1.1598205039696239e-06, "loss": 0.4369, "step": 3051 }, { "epoch": 2.8443616029822927, "grad_norm": 0.13713764152162236, "learning_rate": 1.1529168104936142e-06, "loss": 0.4603, "step": 3052 }, { "epoch": 2.8452935694315005, "grad_norm": 0.1287027411872022, "learning_rate": 1.1460131170176045e-06, "loss": 0.4193, "step": 3053 }, { "epoch": 2.8462255358807083, "grad_norm": 0.12996331534848013, "learning_rate": 1.139109423541595e-06, "loss": 0.4374, "step": 3054 }, { "epoch": 2.847157502329916, "grad_norm": 0.1414469666804456, "learning_rate": 1.132205730065585e-06, "loss": 0.4598, "step": 3055 }, { "epoch": 2.848089468779124, "grad_norm": 0.14079928497109645, "learning_rate": 1.1253020365895754e-06, "loss": 0.4452, "step": 3056 }, { "epoch": 2.8490214352283316, "grad_norm": 0.1327671879693014, "learning_rate": 1.118398343113566e-06, "loss": 0.4405, "step": 3057 }, { "epoch": 2.8499534016775394, "grad_norm": 0.13352542910018772, "learning_rate": 1.1114946496375562e-06, "loss": 0.4385, "step": 3058 }, { "epoch": 2.8508853681267476, "grad_norm": 0.13287479240183503, "learning_rate": 1.1045909561615465e-06, "loss": 0.4435, "step": 3059 }, { "epoch": 2.8518173345759554, "grad_norm": 0.13004855313842284, "learning_rate": 1.0976872626855368e-06, "loss": 0.4266, "step": 3060 }, { "epoch": 2.852749301025163, "grad_norm": 0.13048351578091213, "learning_rate": 1.0907835692095272e-06, "loss": 0.4403, "step": 3061 }, { "epoch": 2.853681267474371, "grad_norm": 0.13028164469043957, "learning_rate": 1.0838798757335175e-06, "loss": 0.4314, "step": 3062 }, { "epoch": 2.854613233923579, "grad_norm": 0.13766022602140748, "learning_rate": 1.0769761822575078e-06, "loss": 0.4442, "step": 3063 }, { "epoch": 2.8555452003727866, "grad_norm": 0.13527919827724394, "learning_rate": 1.0700724887814983e-06, "loss": 0.4524, "step": 3064 }, { "epoch": 2.8564771668219944, "grad_norm": 0.13259649205264668, "learning_rate": 1.0631687953054886e-06, "loss": 0.4345, "step": 3065 }, { "epoch": 2.857409133271202, "grad_norm": 0.14794287964865965, "learning_rate": 1.0562651018294787e-06, "loss": 0.4589, "step": 3066 }, { "epoch": 2.85834109972041, "grad_norm": 0.13099322951491868, "learning_rate": 1.0493614083534692e-06, "loss": 0.4299, "step": 3067 }, { "epoch": 2.8592730661696177, "grad_norm": 0.1343064334824238, "learning_rate": 1.0424577148774595e-06, "loss": 0.4254, "step": 3068 }, { "epoch": 2.8602050326188255, "grad_norm": 0.13792166470022513, "learning_rate": 1.0355540214014498e-06, "loss": 0.4481, "step": 3069 }, { "epoch": 2.8611369990680338, "grad_norm": 0.1445949457632022, "learning_rate": 1.0286503279254402e-06, "loss": 0.4389, "step": 3070 }, { "epoch": 2.862068965517241, "grad_norm": 0.12944069797537439, "learning_rate": 1.0217466344494305e-06, "loss": 0.4383, "step": 3071 }, { "epoch": 2.8630009319664493, "grad_norm": 0.141157290548686, "learning_rate": 1.014842940973421e-06, "loss": 0.459, "step": 3072 }, { "epoch": 2.863932898415657, "grad_norm": 0.1317934711924507, "learning_rate": 1.007939247497411e-06, "loss": 0.4394, "step": 3073 }, { "epoch": 2.864864864864865, "grad_norm": 0.1332308524826816, "learning_rate": 1.0010355540214016e-06, "loss": 0.444, "step": 3074 }, { "epoch": 2.8657968313140727, "grad_norm": 0.13426342354175977, "learning_rate": 9.94131860545392e-07, "loss": 0.4424, "step": 3075 }, { "epoch": 2.8667287977632805, "grad_norm": 0.14025333015664337, "learning_rate": 9.872281670693822e-07, "loss": 0.4394, "step": 3076 }, { "epoch": 2.8676607642124883, "grad_norm": 0.13218360149361974, "learning_rate": 9.803244735933725e-07, "loss": 0.4418, "step": 3077 }, { "epoch": 2.868592730661696, "grad_norm": 0.13230997794431684, "learning_rate": 9.734207801173628e-07, "loss": 0.444, "step": 3078 }, { "epoch": 2.869524697110904, "grad_norm": 0.14242170338290397, "learning_rate": 9.665170866413534e-07, "loss": 0.4648, "step": 3079 }, { "epoch": 2.8704566635601116, "grad_norm": 0.13728540795620792, "learning_rate": 9.596133931653435e-07, "loss": 0.4557, "step": 3080 }, { "epoch": 2.87138863000932, "grad_norm": 0.1396398136227473, "learning_rate": 9.527096996893339e-07, "loss": 0.4489, "step": 3081 }, { "epoch": 2.872320596458527, "grad_norm": 0.13135186099181284, "learning_rate": 9.458060062133243e-07, "loss": 0.4342, "step": 3082 }, { "epoch": 2.8732525629077355, "grad_norm": 0.1309791918541454, "learning_rate": 9.389023127373145e-07, "loss": 0.4175, "step": 3083 }, { "epoch": 2.8741845293569432, "grad_norm": 0.1329060647123516, "learning_rate": 9.319986192613049e-07, "loss": 0.4469, "step": 3084 }, { "epoch": 2.875116495806151, "grad_norm": 0.132859443547529, "learning_rate": 9.250949257852952e-07, "loss": 0.4176, "step": 3085 }, { "epoch": 2.876048462255359, "grad_norm": 0.13332001392244333, "learning_rate": 9.181912323092855e-07, "loss": 0.4159, "step": 3086 }, { "epoch": 2.8769804287045666, "grad_norm": 0.1354673897215564, "learning_rate": 9.112875388332759e-07, "loss": 0.4587, "step": 3087 }, { "epoch": 2.8779123951537744, "grad_norm": 0.13349334446877123, "learning_rate": 9.043838453572662e-07, "loss": 0.4249, "step": 3088 }, { "epoch": 2.878844361602982, "grad_norm": 0.13353910365388408, "learning_rate": 8.974801518812564e-07, "loss": 0.4436, "step": 3089 }, { "epoch": 2.87977632805219, "grad_norm": 0.132969788035215, "learning_rate": 8.905764584052469e-07, "loss": 0.4541, "step": 3090 }, { "epoch": 2.8807082945013978, "grad_norm": 0.1314285939320381, "learning_rate": 8.836727649292372e-07, "loss": 0.4368, "step": 3091 }, { "epoch": 2.881640260950606, "grad_norm": 0.14107836235930798, "learning_rate": 8.767690714532276e-07, "loss": 0.4366, "step": 3092 }, { "epoch": 2.8825722273998133, "grad_norm": 0.1364373161296766, "learning_rate": 8.698653779772179e-07, "loss": 0.4502, "step": 3093 }, { "epoch": 2.8835041938490216, "grad_norm": 0.1363552789161827, "learning_rate": 8.629616845012082e-07, "loss": 0.4468, "step": 3094 }, { "epoch": 2.8844361602982294, "grad_norm": 0.12733954058424723, "learning_rate": 8.560579910251986e-07, "loss": 0.4241, "step": 3095 }, { "epoch": 2.885368126747437, "grad_norm": 0.13720731877407968, "learning_rate": 8.491542975491888e-07, "loss": 0.4538, "step": 3096 }, { "epoch": 2.886300093196645, "grad_norm": 0.14068261161154733, "learning_rate": 8.422506040731792e-07, "loss": 0.4581, "step": 3097 }, { "epoch": 2.8872320596458527, "grad_norm": 0.14315942063938916, "learning_rate": 8.353469105971695e-07, "loss": 0.4567, "step": 3098 }, { "epoch": 2.8881640260950605, "grad_norm": 0.13545686544434213, "learning_rate": 8.284432171211599e-07, "loss": 0.428, "step": 3099 }, { "epoch": 2.8890959925442683, "grad_norm": 0.13248242049652875, "learning_rate": 8.215395236451503e-07, "loss": 0.4385, "step": 3100 }, { "epoch": 2.890027958993476, "grad_norm": 0.12771289221805407, "learning_rate": 8.146358301691405e-07, "loss": 0.4218, "step": 3101 }, { "epoch": 2.890959925442684, "grad_norm": 0.1324299042538949, "learning_rate": 8.07732136693131e-07, "loss": 0.4295, "step": 3102 }, { "epoch": 2.891891891891892, "grad_norm": 0.1326514565381079, "learning_rate": 8.008284432171212e-07, "loss": 0.4249, "step": 3103 }, { "epoch": 2.8928238583410995, "grad_norm": 0.12965301105605148, "learning_rate": 7.939247497411115e-07, "loss": 0.4223, "step": 3104 }, { "epoch": 2.8937558247903077, "grad_norm": 0.13404186917416513, "learning_rate": 7.870210562651019e-07, "loss": 0.4603, "step": 3105 }, { "epoch": 2.8946877912395155, "grad_norm": 0.1385945037755199, "learning_rate": 7.801173627890922e-07, "loss": 0.4499, "step": 3106 }, { "epoch": 2.8956197576887233, "grad_norm": 0.13493385222748994, "learning_rate": 7.732136693130826e-07, "loss": 0.4362, "step": 3107 }, { "epoch": 2.896551724137931, "grad_norm": 0.13165760889073086, "learning_rate": 7.663099758370728e-07, "loss": 0.4632, "step": 3108 }, { "epoch": 2.897483690587139, "grad_norm": 0.13445070764497682, "learning_rate": 7.594062823610632e-07, "loss": 0.465, "step": 3109 }, { "epoch": 2.8984156570363466, "grad_norm": 0.13414988129136315, "learning_rate": 7.525025888850536e-07, "loss": 0.4482, "step": 3110 }, { "epoch": 2.8993476234855544, "grad_norm": 0.1324377467775817, "learning_rate": 7.455988954090439e-07, "loss": 0.4406, "step": 3111 }, { "epoch": 2.900279589934762, "grad_norm": 0.13733639794244704, "learning_rate": 7.386952019330343e-07, "loss": 0.4317, "step": 3112 }, { "epoch": 2.90121155638397, "grad_norm": 0.1375811995346765, "learning_rate": 7.317915084570246e-07, "loss": 0.4387, "step": 3113 }, { "epoch": 2.9021435228331782, "grad_norm": 0.12797297253844217, "learning_rate": 7.248878149810148e-07, "loss": 0.395, "step": 3114 }, { "epoch": 2.9030754892823856, "grad_norm": 0.13206540454690954, "learning_rate": 7.179841215050052e-07, "loss": 0.4448, "step": 3115 }, { "epoch": 2.904007455731594, "grad_norm": 0.13176968716615825, "learning_rate": 7.110804280289955e-07, "loss": 0.4457, "step": 3116 }, { "epoch": 2.9049394221808016, "grad_norm": 0.1467122357234298, "learning_rate": 7.041767345529859e-07, "loss": 0.4463, "step": 3117 }, { "epoch": 2.9058713886300094, "grad_norm": 0.17774035065038754, "learning_rate": 6.972730410769763e-07, "loss": 0.4397, "step": 3118 }, { "epoch": 2.906803355079217, "grad_norm": 0.1334729105962817, "learning_rate": 6.903693476009666e-07, "loss": 0.4395, "step": 3119 }, { "epoch": 2.907735321528425, "grad_norm": 0.13321682613196922, "learning_rate": 6.83465654124957e-07, "loss": 0.4506, "step": 3120 }, { "epoch": 2.9086672879776327, "grad_norm": 0.12857701201161328, "learning_rate": 6.765619606489472e-07, "loss": 0.4248, "step": 3121 }, { "epoch": 2.9095992544268405, "grad_norm": 0.13385233857396492, "learning_rate": 6.696582671729376e-07, "loss": 0.4477, "step": 3122 }, { "epoch": 2.9105312208760483, "grad_norm": 0.14021068516408666, "learning_rate": 6.627545736969279e-07, "loss": 0.4343, "step": 3123 }, { "epoch": 2.911463187325256, "grad_norm": 0.13264483462195273, "learning_rate": 6.558508802209182e-07, "loss": 0.4211, "step": 3124 }, { "epoch": 2.9123951537744643, "grad_norm": 0.12994646130624116, "learning_rate": 6.489471867449086e-07, "loss": 0.4325, "step": 3125 }, { "epoch": 2.9133271202236717, "grad_norm": 0.12636546009452077, "learning_rate": 6.420434932688989e-07, "loss": 0.4156, "step": 3126 }, { "epoch": 2.91425908667288, "grad_norm": 0.12753699152079573, "learning_rate": 6.351397997928894e-07, "loss": 0.4223, "step": 3127 }, { "epoch": 2.9151910531220877, "grad_norm": 0.13153640181241663, "learning_rate": 6.282361063168796e-07, "loss": 0.424, "step": 3128 }, { "epoch": 2.9161230195712955, "grad_norm": 0.1280662882081996, "learning_rate": 6.2133241284087e-07, "loss": 0.4332, "step": 3129 }, { "epoch": 2.9170549860205033, "grad_norm": 0.14146550315661408, "learning_rate": 6.144287193648602e-07, "loss": 0.4588, "step": 3130 }, { "epoch": 2.917986952469711, "grad_norm": 0.13111059623839877, "learning_rate": 6.075250258888506e-07, "loss": 0.4139, "step": 3131 }, { "epoch": 2.918918918918919, "grad_norm": 0.13751992458005774, "learning_rate": 6.006213324128409e-07, "loss": 0.4636, "step": 3132 }, { "epoch": 2.9198508853681266, "grad_norm": 0.13902770711086382, "learning_rate": 5.937176389368313e-07, "loss": 0.4467, "step": 3133 }, { "epoch": 2.9207828518173344, "grad_norm": 0.12931991364389567, "learning_rate": 5.868139454608216e-07, "loss": 0.442, "step": 3134 }, { "epoch": 2.9217148182665422, "grad_norm": 0.13340920065233616, "learning_rate": 5.799102519848119e-07, "loss": 0.452, "step": 3135 }, { "epoch": 2.9226467847157505, "grad_norm": 0.1274232581147795, "learning_rate": 5.730065585088022e-07, "loss": 0.4325, "step": 3136 }, { "epoch": 2.923578751164958, "grad_norm": 0.1257802845569154, "learning_rate": 5.661028650327925e-07, "loss": 0.4232, "step": 3137 }, { "epoch": 2.924510717614166, "grad_norm": 0.13170012000177336, "learning_rate": 5.59199171556783e-07, "loss": 0.4301, "step": 3138 }, { "epoch": 2.925442684063374, "grad_norm": 0.12885204791054045, "learning_rate": 5.522954780807733e-07, "loss": 0.4232, "step": 3139 }, { "epoch": 2.9263746505125816, "grad_norm": 0.13522131879767235, "learning_rate": 5.453917846047636e-07, "loss": 0.4455, "step": 3140 }, { "epoch": 2.9273066169617894, "grad_norm": 0.1291011440737858, "learning_rate": 5.384880911287539e-07, "loss": 0.4148, "step": 3141 }, { "epoch": 2.928238583410997, "grad_norm": 0.13073915721404633, "learning_rate": 5.315843976527443e-07, "loss": 0.4373, "step": 3142 }, { "epoch": 2.929170549860205, "grad_norm": 0.13542179645531277, "learning_rate": 5.246807041767346e-07, "loss": 0.4514, "step": 3143 }, { "epoch": 2.9301025163094128, "grad_norm": 0.13669490039808033, "learning_rate": 5.177770107007249e-07, "loss": 0.4809, "step": 3144 }, { "epoch": 2.9310344827586206, "grad_norm": 0.12880169555657803, "learning_rate": 5.108733172247152e-07, "loss": 0.4474, "step": 3145 }, { "epoch": 2.9319664492078283, "grad_norm": 0.1347923389291226, "learning_rate": 5.039696237487055e-07, "loss": 0.4753, "step": 3146 }, { "epoch": 2.9328984156570366, "grad_norm": 0.1350332321790718, "learning_rate": 4.97065930272696e-07, "loss": 0.4568, "step": 3147 }, { "epoch": 2.933830382106244, "grad_norm": 0.13141129670703014, "learning_rate": 4.901622367966863e-07, "loss": 0.4295, "step": 3148 }, { "epoch": 2.934762348555452, "grad_norm": 0.14047556024013663, "learning_rate": 4.832585433206767e-07, "loss": 0.4639, "step": 3149 }, { "epoch": 2.93569431500466, "grad_norm": 0.131339256409987, "learning_rate": 4.7635484984466693e-07, "loss": 0.4313, "step": 3150 }, { "epoch": 2.9366262814538677, "grad_norm": 0.1320887930980276, "learning_rate": 4.6945115636865724e-07, "loss": 0.4436, "step": 3151 }, { "epoch": 2.9375582479030755, "grad_norm": 0.133334571174382, "learning_rate": 4.625474628926476e-07, "loss": 0.4628, "step": 3152 }, { "epoch": 2.9384902143522833, "grad_norm": 0.12757548634232935, "learning_rate": 4.5564376941663797e-07, "loss": 0.4251, "step": 3153 }, { "epoch": 2.939422180801491, "grad_norm": 0.13429690507080816, "learning_rate": 4.487400759406282e-07, "loss": 0.4655, "step": 3154 }, { "epoch": 2.940354147250699, "grad_norm": 0.13857589557693598, "learning_rate": 4.418363824646186e-07, "loss": 0.4471, "step": 3155 }, { "epoch": 2.9412861136999067, "grad_norm": 0.13097866264554325, "learning_rate": 4.3493268898860895e-07, "loss": 0.4383, "step": 3156 }, { "epoch": 2.9422180801491145, "grad_norm": 0.1314895153747028, "learning_rate": 4.280289955125993e-07, "loss": 0.4615, "step": 3157 }, { "epoch": 2.9431500465983227, "grad_norm": 0.13325445055435522, "learning_rate": 4.211253020365896e-07, "loss": 0.4373, "step": 3158 }, { "epoch": 2.94408201304753, "grad_norm": 0.12954003487739407, "learning_rate": 4.142216085605799e-07, "loss": 0.4268, "step": 3159 }, { "epoch": 2.9450139794967383, "grad_norm": 0.1375005057853076, "learning_rate": 4.0731791508457024e-07, "loss": 0.4512, "step": 3160 }, { "epoch": 2.945945945945946, "grad_norm": 0.13119139716670636, "learning_rate": 4.004142216085606e-07, "loss": 0.4308, "step": 3161 }, { "epoch": 2.946877912395154, "grad_norm": 0.13214841289118445, "learning_rate": 3.9351052813255096e-07, "loss": 0.4262, "step": 3162 }, { "epoch": 2.9478098788443616, "grad_norm": 0.13564784419733958, "learning_rate": 3.866068346565413e-07, "loss": 0.4616, "step": 3163 }, { "epoch": 2.9487418452935694, "grad_norm": 0.13459646398816139, "learning_rate": 3.797031411805316e-07, "loss": 0.4263, "step": 3164 }, { "epoch": 2.949673811742777, "grad_norm": 0.1310543916288025, "learning_rate": 3.7279944770452194e-07, "loss": 0.4324, "step": 3165 }, { "epoch": 2.950605778191985, "grad_norm": 0.13286643567907663, "learning_rate": 3.658957542285123e-07, "loss": 0.4318, "step": 3166 }, { "epoch": 2.951537744641193, "grad_norm": 0.13320603595604388, "learning_rate": 3.589920607525026e-07, "loss": 0.4425, "step": 3167 }, { "epoch": 2.9524697110904006, "grad_norm": 0.13703522959400047, "learning_rate": 3.5208836727649297e-07, "loss": 0.4667, "step": 3168 }, { "epoch": 2.953401677539609, "grad_norm": 0.12938066328898268, "learning_rate": 3.451846738004833e-07, "loss": 0.4426, "step": 3169 }, { "epoch": 2.954333643988816, "grad_norm": 0.13140451383382634, "learning_rate": 3.382809803244736e-07, "loss": 0.4463, "step": 3170 }, { "epoch": 2.9552656104380244, "grad_norm": 0.14041242484163932, "learning_rate": 3.3137728684846395e-07, "loss": 0.4628, "step": 3171 }, { "epoch": 2.956197576887232, "grad_norm": 0.1340698784210778, "learning_rate": 3.244735933724543e-07, "loss": 0.4492, "step": 3172 }, { "epoch": 2.95712954333644, "grad_norm": 0.13951001687303882, "learning_rate": 3.175698998964447e-07, "loss": 0.4635, "step": 3173 }, { "epoch": 2.9580615097856477, "grad_norm": 0.12328239164000306, "learning_rate": 3.10666206420435e-07, "loss": 0.4125, "step": 3174 }, { "epoch": 2.9589934762348555, "grad_norm": 0.13196805881420703, "learning_rate": 3.037625129444253e-07, "loss": 0.446, "step": 3175 }, { "epoch": 2.9599254426840633, "grad_norm": 0.13067688057224477, "learning_rate": 2.9685881946841566e-07, "loss": 0.4297, "step": 3176 }, { "epoch": 2.960857409133271, "grad_norm": 0.13980141953402753, "learning_rate": 2.8995512599240596e-07, "loss": 0.4661, "step": 3177 }, { "epoch": 2.961789375582479, "grad_norm": 0.13062437017346237, "learning_rate": 2.8305143251639627e-07, "loss": 0.4364, "step": 3178 }, { "epoch": 2.9627213420316867, "grad_norm": 0.1305301476788854, "learning_rate": 2.7614773904038664e-07, "loss": 0.4395, "step": 3179 }, { "epoch": 2.963653308480895, "grad_norm": 0.13381201836096263, "learning_rate": 2.6924404556437694e-07, "loss": 0.4589, "step": 3180 }, { "epoch": 2.9645852749301023, "grad_norm": 0.1333918794860028, "learning_rate": 2.623403520883673e-07, "loss": 0.4297, "step": 3181 }, { "epoch": 2.9655172413793105, "grad_norm": 0.13307447419737933, "learning_rate": 2.554366586123576e-07, "loss": 0.4429, "step": 3182 }, { "epoch": 2.9664492078285183, "grad_norm": 0.13804987315529108, "learning_rate": 2.48532965136348e-07, "loss": 0.4516, "step": 3183 }, { "epoch": 2.967381174277726, "grad_norm": 0.1334618430218174, "learning_rate": 2.4162927166033834e-07, "loss": 0.4462, "step": 3184 }, { "epoch": 2.968313140726934, "grad_norm": 0.1306884540540982, "learning_rate": 2.3472557818432862e-07, "loss": 0.4257, "step": 3185 }, { "epoch": 2.9692451071761417, "grad_norm": 0.13386908388705385, "learning_rate": 2.2782188470831898e-07, "loss": 0.4339, "step": 3186 }, { "epoch": 2.9701770736253494, "grad_norm": 0.13204905890157134, "learning_rate": 2.209181912323093e-07, "loss": 0.4345, "step": 3187 }, { "epoch": 2.9711090400745572, "grad_norm": 0.1391687233694688, "learning_rate": 2.1401449775629965e-07, "loss": 0.434, "step": 3188 }, { "epoch": 2.972041006523765, "grad_norm": 0.13407456207592736, "learning_rate": 2.0711080428028996e-07, "loss": 0.4486, "step": 3189 }, { "epoch": 2.972972972972973, "grad_norm": 0.12969529785178258, "learning_rate": 2.002071108042803e-07, "loss": 0.4311, "step": 3190 }, { "epoch": 2.973904939422181, "grad_norm": 0.13616192294871693, "learning_rate": 1.9330341732827066e-07, "loss": 0.4459, "step": 3191 }, { "epoch": 2.9748369058713884, "grad_norm": 0.1362526014552454, "learning_rate": 1.8639972385226097e-07, "loss": 0.4456, "step": 3192 }, { "epoch": 2.9757688723205966, "grad_norm": 0.13424685091528737, "learning_rate": 1.794960303762513e-07, "loss": 0.4338, "step": 3193 }, { "epoch": 2.9767008387698044, "grad_norm": 0.13592076915939935, "learning_rate": 1.7259233690024164e-07, "loss": 0.4509, "step": 3194 }, { "epoch": 2.977632805219012, "grad_norm": 0.13245116726508435, "learning_rate": 1.6568864342423198e-07, "loss": 0.4365, "step": 3195 }, { "epoch": 2.97856477166822, "grad_norm": 0.13482239106210497, "learning_rate": 1.5878494994822234e-07, "loss": 0.4496, "step": 3196 }, { "epoch": 2.9794967381174278, "grad_norm": 0.13108161004946503, "learning_rate": 1.5188125647221265e-07, "loss": 0.4331, "step": 3197 }, { "epoch": 2.9804287045666356, "grad_norm": 0.13814908479707994, "learning_rate": 1.4497756299620298e-07, "loss": 0.456, "step": 3198 }, { "epoch": 2.9813606710158433, "grad_norm": 0.1328345634702037, "learning_rate": 1.3807386952019332e-07, "loss": 0.4682, "step": 3199 }, { "epoch": 2.982292637465051, "grad_norm": 0.13552040452941883, "learning_rate": 1.3117017604418365e-07, "loss": 0.4456, "step": 3200 }, { "epoch": 2.983224603914259, "grad_norm": 0.13967062863930277, "learning_rate": 1.24266482568174e-07, "loss": 0.4495, "step": 3201 }, { "epoch": 2.984156570363467, "grad_norm": 0.13197879242682095, "learning_rate": 1.1736278909216431e-07, "loss": 0.4291, "step": 3202 }, { "epoch": 2.9850885368126745, "grad_norm": 0.14085959681586344, "learning_rate": 1.1045909561615465e-07, "loss": 0.4342, "step": 3203 }, { "epoch": 2.9860205032618827, "grad_norm": 0.13275185810175594, "learning_rate": 1.0355540214014498e-07, "loss": 0.4532, "step": 3204 }, { "epoch": 2.9869524697110905, "grad_norm": 0.13438161193562823, "learning_rate": 9.665170866413533e-08, "loss": 0.4338, "step": 3205 }, { "epoch": 2.9878844361602983, "grad_norm": 0.13542691853521674, "learning_rate": 8.974801518812565e-08, "loss": 0.4354, "step": 3206 }, { "epoch": 2.988816402609506, "grad_norm": 0.13225691296799275, "learning_rate": 8.284432171211599e-08, "loss": 0.4715, "step": 3207 }, { "epoch": 2.989748369058714, "grad_norm": 0.12766081720613895, "learning_rate": 7.594062823610632e-08, "loss": 0.4111, "step": 3208 }, { "epoch": 2.9906803355079217, "grad_norm": 0.13574206886354784, "learning_rate": 6.903693476009666e-08, "loss": 0.4562, "step": 3209 }, { "epoch": 2.9916123019571295, "grad_norm": 0.133343346500648, "learning_rate": 6.2133241284087e-08, "loss": 0.4365, "step": 3210 }, { "epoch": 2.9925442684063372, "grad_norm": 0.12948465990092325, "learning_rate": 5.522954780807732e-08, "loss": 0.4425, "step": 3211 }, { "epoch": 2.993476234855545, "grad_norm": 0.1362464158435248, "learning_rate": 4.8325854332067665e-08, "loss": 0.4604, "step": 3212 }, { "epoch": 2.9944082013047533, "grad_norm": 0.1306446248143371, "learning_rate": 4.1422160856057994e-08, "loss": 0.4468, "step": 3213 }, { "epoch": 2.9953401677539606, "grad_norm": 0.13176464429792517, "learning_rate": 3.451846738004833e-08, "loss": 0.457, "step": 3214 }, { "epoch": 2.996272134203169, "grad_norm": 0.13119055423566733, "learning_rate": 2.761477390403866e-08, "loss": 0.4379, "step": 3215 }, { "epoch": 2.9972041006523766, "grad_norm": 0.1415027875834078, "learning_rate": 2.0711080428028997e-08, "loss": 0.452, "step": 3216 }, { "epoch": 2.9981360671015844, "grad_norm": 0.13410257031776446, "learning_rate": 1.380738695201933e-08, "loss": 0.447, "step": 3217 }, { "epoch": 2.999068033550792, "grad_norm": 0.12951701731376858, "learning_rate": 6.903693476009665e-09, "loss": 0.4205, "step": 3218 }, { "epoch": 3.0, "grad_norm": 0.13486197053945453, "learning_rate": 0.0, "loss": 0.4616, "step": 3219 }, { "epoch": 3.0, "step": 3219, "total_flos": 1467231158403072.0, "train_loss": 0.0, "train_runtime": 0.7869, "train_samples_per_second": 65449.528, "train_steps_per_second": 4090.596 } ], "logging_steps": 1, "max_steps": 3219, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1467231158403072.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }