{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002564102564102564, "grad_norm": 0.13979803025722504, "learning_rate": 2.564102564102564e-07, "loss": 0.9999, "step": 1 }, { "epoch": 0.005128205128205128, "grad_norm": 0.15988187491893768, "learning_rate": 5.128205128205128e-07, "loss": 1.0694, "step": 2 }, { "epoch": 0.007692307692307693, "grad_norm": 0.1669122576713562, "learning_rate": 7.692307692307694e-07, "loss": 1.1319, "step": 3 }, { "epoch": 0.010256410256410256, "grad_norm": 0.15094897150993347, "learning_rate": 1.0256410256410257e-06, "loss": 0.956, "step": 4 }, { "epoch": 0.01282051282051282, "grad_norm": 0.15886467695236206, "learning_rate": 1.2820512820512822e-06, "loss": 1.2478, "step": 5 }, { "epoch": 0.015384615384615385, "grad_norm": 0.13728941977024078, "learning_rate": 1.5384615384615387e-06, "loss": 1.0633, "step": 6 }, { "epoch": 0.017948717948717947, "grad_norm": 0.14830218255519867, "learning_rate": 1.794871794871795e-06, "loss": 0.9022, "step": 7 }, { "epoch": 0.020512820512820513, "grad_norm": 0.17115703225135803, "learning_rate": 2.0512820512820513e-06, "loss": 1.1275, "step": 8 }, { "epoch": 0.023076923076923078, "grad_norm": 0.15854498744010925, "learning_rate": 2.3076923076923077e-06, "loss": 1.0972, "step": 9 }, { "epoch": 0.02564102564102564, "grad_norm": 0.15408478677272797, "learning_rate": 2.5641025641025644e-06, "loss": 1.0453, "step": 10 }, { "epoch": 0.028205128205128206, "grad_norm": 0.15272922813892365, "learning_rate": 2.8205128205128207e-06, "loss": 1.0399, "step": 11 }, { "epoch": 0.03076923076923077, "grad_norm": 0.14716756343841553, "learning_rate": 3.0769230769230774e-06, "loss": 1.0765, "step": 12 }, { "epoch": 0.03333333333333333, "grad_norm": 0.14600983262062073, "learning_rate": 3.3333333333333337e-06, "loss": 1.0616, "step": 13 }, { "epoch": 0.035897435897435895, "grad_norm": 0.14107345044612885, "learning_rate": 3.58974358974359e-06, "loss": 1.1168, "step": 14 }, { "epoch": 0.038461538461538464, "grad_norm": 0.14267683029174805, "learning_rate": 3.846153846153847e-06, "loss": 1.0462, "step": 15 }, { "epoch": 0.041025641025641026, "grad_norm": 0.1449165940284729, "learning_rate": 4.102564102564103e-06, "loss": 1.1341, "step": 16 }, { "epoch": 0.04358974358974359, "grad_norm": 0.14054569602012634, "learning_rate": 4.358974358974359e-06, "loss": 1.0704, "step": 17 }, { "epoch": 0.046153846153846156, "grad_norm": 0.14372889697551727, "learning_rate": 4.615384615384615e-06, "loss": 1.1511, "step": 18 }, { "epoch": 0.04871794871794872, "grad_norm": 0.13227999210357666, "learning_rate": 4.871794871794872e-06, "loss": 1.009, "step": 19 }, { "epoch": 0.05128205128205128, "grad_norm": 0.15212635695934296, "learning_rate": 5.128205128205129e-06, "loss": 1.1836, "step": 20 }, { "epoch": 0.05384615384615385, "grad_norm": 0.1289321482181549, "learning_rate": 5.384615384615385e-06, "loss": 1.1769, "step": 21 }, { "epoch": 0.05641025641025641, "grad_norm": 0.12752899527549744, "learning_rate": 5.641025641025641e-06, "loss": 1.0759, "step": 22 }, { "epoch": 0.05897435897435897, "grad_norm": 0.12242157757282257, "learning_rate": 5.897435897435897e-06, "loss": 1.0826, "step": 23 }, { "epoch": 0.06153846153846154, "grad_norm": 0.1286514699459076, "learning_rate": 6.153846153846155e-06, "loss": 1.05, "step": 24 }, { "epoch": 0.0641025641025641, "grad_norm": 0.13825258612632751, "learning_rate": 6.410256410256411e-06, "loss": 1.0944, "step": 25 }, { "epoch": 0.06666666666666667, "grad_norm": 0.13136117160320282, "learning_rate": 6.6666666666666675e-06, "loss": 1.0472, "step": 26 }, { "epoch": 0.06923076923076923, "grad_norm": 0.11351772397756577, "learning_rate": 6.923076923076923e-06, "loss": 1.071, "step": 27 }, { "epoch": 0.07179487179487179, "grad_norm": 0.12361160665750504, "learning_rate": 7.17948717948718e-06, "loss": 1.0955, "step": 28 }, { "epoch": 0.07435897435897436, "grad_norm": 0.13740238547325134, "learning_rate": 7.435897435897436e-06, "loss": 1.1061, "step": 29 }, { "epoch": 0.07692307692307693, "grad_norm": 0.11295292526483536, "learning_rate": 7.692307692307694e-06, "loss": 1.0159, "step": 30 }, { "epoch": 0.07948717948717948, "grad_norm": 0.12402593344449997, "learning_rate": 7.948717948717949e-06, "loss": 1.0859, "step": 31 }, { "epoch": 0.08205128205128205, "grad_norm": 0.1193617656826973, "learning_rate": 8.205128205128205e-06, "loss": 1.1554, "step": 32 }, { "epoch": 0.08461538461538462, "grad_norm": 0.12157738953828812, "learning_rate": 8.461538461538462e-06, "loss": 1.0643, "step": 33 }, { "epoch": 0.08717948717948718, "grad_norm": 0.11561132967472076, "learning_rate": 8.717948717948719e-06, "loss": 0.9329, "step": 34 }, { "epoch": 0.08974358974358974, "grad_norm": 0.1199595183134079, "learning_rate": 8.974358974358976e-06, "loss": 0.9432, "step": 35 }, { "epoch": 0.09230769230769231, "grad_norm": 0.27351143956184387, "learning_rate": 9.23076923076923e-06, "loss": 0.9671, "step": 36 }, { "epoch": 0.09487179487179487, "grad_norm": 0.11380849033594131, "learning_rate": 9.487179487179489e-06, "loss": 1.0776, "step": 37 }, { "epoch": 0.09743589743589744, "grad_norm": 0.12026315927505493, "learning_rate": 9.743589743589744e-06, "loss": 0.9837, "step": 38 }, { "epoch": 0.1, "grad_norm": 0.11509953439235687, "learning_rate": 1e-05, "loss": 1.0578, "step": 39 }, { "epoch": 0.10256410256410256, "grad_norm": 0.12789179384708405, "learning_rate": 9.99878394811512e-06, "loss": 1.0436, "step": 40 }, { "epoch": 0.10512820512820513, "grad_norm": 0.10106956213712692, "learning_rate": 9.997564935064936e-06, "loss": 0.934, "step": 41 }, { "epoch": 0.1076923076923077, "grad_norm": 0.11464275419712067, "learning_rate": 9.996342950020318e-06, "loss": 1.0297, "step": 42 }, { "epoch": 0.11025641025641025, "grad_norm": 0.11068426072597504, "learning_rate": 9.995117982099268e-06, "loss": 1.1004, "step": 43 }, { "epoch": 0.11282051282051282, "grad_norm": 0.10913486778736115, "learning_rate": 9.993890020366601e-06, "loss": 0.92, "step": 44 }, { "epoch": 0.11538461538461539, "grad_norm": 0.11235719919204712, "learning_rate": 9.992659053833607e-06, "loss": 1.0534, "step": 45 }, { "epoch": 0.11794871794871795, "grad_norm": 0.10900150239467621, "learning_rate": 9.991425071457738e-06, "loss": 1.0011, "step": 46 }, { "epoch": 0.12051282051282051, "grad_norm": 0.11291161179542542, "learning_rate": 9.990188062142274e-06, "loss": 0.9889, "step": 47 }, { "epoch": 0.12307692307692308, "grad_norm": 0.12270451337099075, "learning_rate": 9.988948014735981e-06, "loss": 1.1178, "step": 48 }, { "epoch": 0.12564102564102564, "grad_norm": 0.109133280813694, "learning_rate": 9.987704918032787e-06, "loss": 1.0422, "step": 49 }, { "epoch": 0.1282051282051282, "grad_norm": 0.11073730140924454, "learning_rate": 9.98645876077144e-06, "loss": 1.1226, "step": 50 }, { "epoch": 0.13076923076923078, "grad_norm": 0.10467839986085892, "learning_rate": 9.98520953163517e-06, "loss": 1.089, "step": 51 }, { "epoch": 0.13333333333333333, "grad_norm": 0.10366383194923401, "learning_rate": 9.983957219251336e-06, "loss": 1.1206, "step": 52 }, { "epoch": 0.1358974358974359, "grad_norm": 0.10720381140708923, "learning_rate": 9.982701812191105e-06, "loss": 1.091, "step": 53 }, { "epoch": 0.13846153846153847, "grad_norm": 0.2592061161994934, "learning_rate": 9.981443298969074e-06, "loss": 0.964, "step": 54 }, { "epoch": 0.14102564102564102, "grad_norm": 0.10395167022943497, "learning_rate": 9.98018166804294e-06, "loss": 1.1305, "step": 55 }, { "epoch": 0.14358974358974358, "grad_norm": 0.10875218361616135, "learning_rate": 9.978916907813147e-06, "loss": 1.1347, "step": 56 }, { "epoch": 0.14615384615384616, "grad_norm": 0.10331016033887863, "learning_rate": 9.977649006622518e-06, "loss": 1.1921, "step": 57 }, { "epoch": 0.14871794871794872, "grad_norm": 0.10478100180625916, "learning_rate": 9.976377952755907e-06, "loss": 1.0031, "step": 58 }, { "epoch": 0.15128205128205127, "grad_norm": 0.09711793065071106, "learning_rate": 9.975103734439834e-06, "loss": 0.9949, "step": 59 }, { "epoch": 0.15384615384615385, "grad_norm": 0.10558706521987915, "learning_rate": 9.973826339842128e-06, "loss": 1.0029, "step": 60 }, { "epoch": 0.1564102564102564, "grad_norm": 0.09912573546171188, "learning_rate": 9.972545757071548e-06, "loss": 0.933, "step": 61 }, { "epoch": 0.15897435897435896, "grad_norm": 0.11607331037521362, "learning_rate": 9.971261974177426e-06, "loss": 0.9942, "step": 62 }, { "epoch": 0.16153846153846155, "grad_norm": 0.10281538218259811, "learning_rate": 9.969974979149292e-06, "loss": 1.0307, "step": 63 }, { "epoch": 0.1641025641025641, "grad_norm": 0.10646649450063705, "learning_rate": 9.968684759916494e-06, "loss": 1.0052, "step": 64 }, { "epoch": 0.16666666666666666, "grad_norm": 0.1755123883485794, "learning_rate": 9.967391304347826e-06, "loss": 0.9506, "step": 65 }, { "epoch": 0.16923076923076924, "grad_norm": 0.11006496846675873, "learning_rate": 9.966094600251151e-06, "loss": 0.9679, "step": 66 }, { "epoch": 0.1717948717948718, "grad_norm": 0.11007404327392578, "learning_rate": 9.96479463537301e-06, "loss": 1.0251, "step": 67 }, { "epoch": 0.17435897435897435, "grad_norm": 0.1030791848897934, "learning_rate": 9.963491397398239e-06, "loss": 1.0104, "step": 68 }, { "epoch": 0.17692307692307693, "grad_norm": 0.10340573638677597, "learning_rate": 9.962184873949581e-06, "loss": 1.1273, "step": 69 }, { "epoch": 0.1794871794871795, "grad_norm": 0.10667295008897781, "learning_rate": 9.960875052587295e-06, "loss": 1.1031, "step": 70 }, { "epoch": 0.18205128205128204, "grad_norm": 0.10353393852710724, "learning_rate": 9.959561920808762e-06, "loss": 0.9595, "step": 71 }, { "epoch": 0.18461538461538463, "grad_norm": 0.10161738842725754, "learning_rate": 9.95824546604808e-06, "loss": 0.9629, "step": 72 }, { "epoch": 0.18717948717948718, "grad_norm": 0.11324603855609894, "learning_rate": 9.956925675675678e-06, "loss": 1.2039, "step": 73 }, { "epoch": 0.18974358974358974, "grad_norm": 0.11005936563014984, "learning_rate": 9.955602536997886e-06, "loss": 1.0425, "step": 74 }, { "epoch": 0.19230769230769232, "grad_norm": 0.10767950117588043, "learning_rate": 9.954276037256563e-06, "loss": 1.0281, "step": 75 }, { "epoch": 0.19487179487179487, "grad_norm": 0.10422754287719727, "learning_rate": 9.952946163628658e-06, "loss": 1.0155, "step": 76 }, { "epoch": 0.19743589743589743, "grad_norm": 0.10673552006483078, "learning_rate": 9.951612903225807e-06, "loss": 1.184, "step": 77 }, { "epoch": 0.2, "grad_norm": 0.11334969103336334, "learning_rate": 9.950276243093924e-06, "loss": 0.9366, "step": 78 }, { "epoch": 0.20256410256410257, "grad_norm": 0.10220715403556824, "learning_rate": 9.948936170212767e-06, "loss": 1.0855, "step": 79 }, { "epoch": 0.20512820512820512, "grad_norm": 0.10753922909498215, "learning_rate": 9.947592671495527e-06, "loss": 0.946, "step": 80 }, { "epoch": 0.2076923076923077, "grad_norm": 0.1163082867860794, "learning_rate": 9.946245733788397e-06, "loss": 1.0971, "step": 81 }, { "epoch": 0.21025641025641026, "grad_norm": 0.11111017316579819, "learning_rate": 9.944895343870142e-06, "loss": 1.0869, "step": 82 }, { "epoch": 0.2128205128205128, "grad_norm": 0.10907071083784103, "learning_rate": 9.943541488451669e-06, "loss": 0.9786, "step": 83 }, { "epoch": 0.2153846153846154, "grad_norm": 0.10257716476917267, "learning_rate": 9.94218415417559e-06, "loss": 0.9889, "step": 84 }, { "epoch": 0.21794871794871795, "grad_norm": 0.10978135466575623, "learning_rate": 9.94082332761578e-06, "loss": 1.1028, "step": 85 }, { "epoch": 0.2205128205128205, "grad_norm": 0.110615074634552, "learning_rate": 9.939458995276944e-06, "loss": 1.019, "step": 86 }, { "epoch": 0.2230769230769231, "grad_norm": 0.11058582365512848, "learning_rate": 9.938091143594154e-06, "loss": 0.9996, "step": 87 }, { "epoch": 0.22564102564102564, "grad_norm": 0.11037719249725342, "learning_rate": 9.936719758932415e-06, "loss": 1.0338, "step": 88 }, { "epoch": 0.2282051282051282, "grad_norm": 0.10798349976539612, "learning_rate": 9.935344827586207e-06, "loss": 0.9677, "step": 89 }, { "epoch": 0.23076923076923078, "grad_norm": 0.10694784671068192, "learning_rate": 9.933966335779024e-06, "loss": 1.0419, "step": 90 }, { "epoch": 0.23333333333333334, "grad_norm": 0.13677257299423218, "learning_rate": 9.932584269662922e-06, "loss": 1.0015, "step": 91 }, { "epoch": 0.2358974358974359, "grad_norm": 0.11084003746509552, "learning_rate": 9.931198615318045e-06, "loss": 1.0031, "step": 92 }, { "epoch": 0.23846153846153847, "grad_norm": 0.10615186393260956, "learning_rate": 9.929809358752167e-06, "loss": 0.9214, "step": 93 }, { "epoch": 0.24102564102564103, "grad_norm": 0.10620255023241043, "learning_rate": 9.928416485900218e-06, "loss": 0.9185, "step": 94 }, { "epoch": 0.24358974358974358, "grad_norm": 0.11533376574516296, "learning_rate": 9.927019982623805e-06, "loss": 1.0308, "step": 95 }, { "epoch": 0.24615384615384617, "grad_norm": 0.1096138209104538, "learning_rate": 9.925619834710745e-06, "loss": 1.0478, "step": 96 }, { "epoch": 0.24871794871794872, "grad_norm": 0.11876872926950455, "learning_rate": 9.924216027874566e-06, "loss": 1.0602, "step": 97 }, { "epoch": 0.2512820512820513, "grad_norm": 0.10606134682893753, "learning_rate": 9.922808547754035e-06, "loss": 0.901, "step": 98 }, { "epoch": 0.25384615384615383, "grad_norm": 0.11515390872955322, "learning_rate": 9.921397379912666e-06, "loss": 1.0333, "step": 99 }, { "epoch": 0.2564102564102564, "grad_norm": 0.11910593509674072, "learning_rate": 9.919982509838217e-06, "loss": 0.9956, "step": 100 }, { "epoch": 0.258974358974359, "grad_norm": 0.12178193032741547, "learning_rate": 9.918563922942208e-06, "loss": 1.071, "step": 101 }, { "epoch": 0.26153846153846155, "grad_norm": 0.1089189425110817, "learning_rate": 9.917141604559404e-06, "loss": 0.9349, "step": 102 }, { "epoch": 0.2641025641025641, "grad_norm": 0.1138150617480278, "learning_rate": 9.915715539947322e-06, "loss": 1.0169, "step": 103 }, { "epoch": 0.26666666666666666, "grad_norm": 0.1112711951136589, "learning_rate": 9.914285714285713e-06, "loss": 0.9022, "step": 104 }, { "epoch": 0.2692307692307692, "grad_norm": 0.11421187967061996, "learning_rate": 9.912852112676058e-06, "loss": 0.9711, "step": 105 }, { "epoch": 0.2717948717948718, "grad_norm": 0.10921610891819, "learning_rate": 9.911414720141032e-06, "loss": 0.9388, "step": 106 }, { "epoch": 0.2743589743589744, "grad_norm": 0.11643636971712112, "learning_rate": 9.909973521624008e-06, "loss": 1.003, "step": 107 }, { "epoch": 0.27692307692307694, "grad_norm": 0.13560256361961365, "learning_rate": 9.908528501988513e-06, "loss": 0.9955, "step": 108 }, { "epoch": 0.2794871794871795, "grad_norm": 0.11191970109939575, "learning_rate": 9.9070796460177e-06, "loss": 1.0105, "step": 109 }, { "epoch": 0.28205128205128205, "grad_norm": 0.1319538950920105, "learning_rate": 9.905626938413824e-06, "loss": 1.0841, "step": 110 }, { "epoch": 0.2846153846153846, "grad_norm": 0.11922305077314377, "learning_rate": 9.904170363797693e-06, "loss": 0.9614, "step": 111 }, { "epoch": 0.28717948717948716, "grad_norm": 0.11520028859376907, "learning_rate": 9.902709906708132e-06, "loss": 0.9648, "step": 112 }, { "epoch": 0.28974358974358977, "grad_norm": 0.10564184933900833, "learning_rate": 9.901245551601424e-06, "loss": 0.9224, "step": 113 }, { "epoch": 0.2923076923076923, "grad_norm": 0.11402294784784317, "learning_rate": 9.89977728285078e-06, "loss": 1.0839, "step": 114 }, { "epoch": 0.2948717948717949, "grad_norm": 0.1240580752491951, "learning_rate": 9.898305084745763e-06, "loss": 0.9802, "step": 115 }, { "epoch": 0.29743589743589743, "grad_norm": 0.12268956750631332, "learning_rate": 9.896828941491739e-06, "loss": 1.0552, "step": 116 }, { "epoch": 0.3, "grad_norm": 0.11103710532188416, "learning_rate": 9.895348837209303e-06, "loss": 0.9337, "step": 117 }, { "epoch": 0.30256410256410254, "grad_norm": 0.1281978338956833, "learning_rate": 9.893864755933724e-06, "loss": 1.0919, "step": 118 }, { "epoch": 0.30512820512820515, "grad_norm": 0.11921875178813934, "learning_rate": 9.892376681614351e-06, "loss": 1.0019, "step": 119 }, { "epoch": 0.3076923076923077, "grad_norm": 0.11357328295707703, "learning_rate": 9.890884598114054e-06, "loss": 0.9391, "step": 120 }, { "epoch": 0.31025641025641026, "grad_norm": 0.12437216937541962, "learning_rate": 9.889388489208635e-06, "loss": 1.0949, "step": 121 }, { "epoch": 0.3128205128205128, "grad_norm": 0.11032367497682571, "learning_rate": 9.887888338586223e-06, "loss": 1.0712, "step": 122 }, { "epoch": 0.3153846153846154, "grad_norm": 0.10982154309749603, "learning_rate": 9.886384129846709e-06, "loss": 1.0995, "step": 123 }, { "epoch": 0.31794871794871793, "grad_norm": 0.11465884000062943, "learning_rate": 9.88487584650113e-06, "loss": 1.1997, "step": 124 }, { "epoch": 0.32051282051282054, "grad_norm": 0.11689360439777374, "learning_rate": 9.883363471971068e-06, "loss": 0.9555, "step": 125 }, { "epoch": 0.3230769230769231, "grad_norm": 0.12228330969810486, "learning_rate": 9.88184698958805e-06, "loss": 1.0185, "step": 126 }, { "epoch": 0.32564102564102565, "grad_norm": 0.11858666688203812, "learning_rate": 9.88032638259293e-06, "loss": 0.9995, "step": 127 }, { "epoch": 0.3282051282051282, "grad_norm": 0.107363760471344, "learning_rate": 9.87880163413527e-06, "loss": 0.9283, "step": 128 }, { "epoch": 0.33076923076923076, "grad_norm": 0.1286807507276535, "learning_rate": 9.877272727272727e-06, "loss": 0.9635, "step": 129 }, { "epoch": 0.3333333333333333, "grad_norm": 0.13207103312015533, "learning_rate": 9.875739644970415e-06, "loss": 1.0259, "step": 130 }, { "epoch": 0.33589743589743587, "grad_norm": 0.12238481640815735, "learning_rate": 9.874202370100275e-06, "loss": 0.9458, "step": 131 }, { "epoch": 0.3384615384615385, "grad_norm": 0.12218200415372849, "learning_rate": 9.87266088544044e-06, "loss": 0.9856, "step": 132 }, { "epoch": 0.34102564102564104, "grad_norm": 0.192653626203537, "learning_rate": 9.871115173674589e-06, "loss": 1.1864, "step": 133 }, { "epoch": 0.3435897435897436, "grad_norm": 0.12083520740270615, "learning_rate": 9.869565217391306e-06, "loss": 1.0003, "step": 134 }, { "epoch": 0.34615384615384615, "grad_norm": 0.11669037491083145, "learning_rate": 9.86801099908341e-06, "loss": 1.0615, "step": 135 }, { "epoch": 0.3487179487179487, "grad_norm": 0.11430592834949493, "learning_rate": 9.866452501147316e-06, "loss": 0.927, "step": 136 }, { "epoch": 0.35128205128205126, "grad_norm": 0.10570185631513596, "learning_rate": 9.864889705882355e-06, "loss": 0.9244, "step": 137 }, { "epoch": 0.35384615384615387, "grad_norm": 0.12382305413484573, "learning_rate": 9.863322595490108e-06, "loss": 1.0086, "step": 138 }, { "epoch": 0.3564102564102564, "grad_norm": 0.11679980158805847, "learning_rate": 9.861751152073734e-06, "loss": 0.9317, "step": 139 }, { "epoch": 0.358974358974359, "grad_norm": 0.12016775459051132, "learning_rate": 9.860175357637288e-06, "loss": 0.9946, "step": 140 }, { "epoch": 0.36153846153846153, "grad_norm": 0.11385658383369446, "learning_rate": 9.858595194085029e-06, "loss": 0.9507, "step": 141 }, { "epoch": 0.3641025641025641, "grad_norm": 0.11400415003299713, "learning_rate": 9.857010643220733e-06, "loss": 0.977, "step": 142 }, { "epoch": 0.36666666666666664, "grad_norm": 0.12075012922286987, "learning_rate": 9.855421686746988e-06, "loss": 1.1277, "step": 143 }, { "epoch": 0.36923076923076925, "grad_norm": 0.12377439439296722, "learning_rate": 9.853828306264502e-06, "loss": 1.0703, "step": 144 }, { "epoch": 0.3717948717948718, "grad_norm": 0.11683501303195953, "learning_rate": 9.852230483271376e-06, "loss": 0.9773, "step": 145 }, { "epoch": 0.37435897435897436, "grad_norm": 0.11950255930423737, "learning_rate": 9.850628199162401e-06, "loss": 0.9428, "step": 146 }, { "epoch": 0.3769230769230769, "grad_norm": 0.12005724757909775, "learning_rate": 9.849021435228333e-06, "loss": 1.1703, "step": 147 }, { "epoch": 0.37948717948717947, "grad_norm": 0.11797571182250977, "learning_rate": 9.847410172655158e-06, "loss": 0.8719, "step": 148 }, { "epoch": 0.382051282051282, "grad_norm": 0.1225227490067482, "learning_rate": 9.845794392523365e-06, "loss": 1.1311, "step": 149 }, { "epoch": 0.38461538461538464, "grad_norm": 0.12024562805891037, "learning_rate": 9.844174075807208e-06, "loss": 1.1669, "step": 150 }, { "epoch": 0.3871794871794872, "grad_norm": 0.12286081910133362, "learning_rate": 9.842549203373947e-06, "loss": 1.1541, "step": 151 }, { "epoch": 0.38974358974358975, "grad_norm": 0.1257518231868744, "learning_rate": 9.840919755983107e-06, "loss": 0.9313, "step": 152 }, { "epoch": 0.3923076923076923, "grad_norm": 0.12524078786373138, "learning_rate": 9.839285714285715e-06, "loss": 0.904, "step": 153 }, { "epoch": 0.39487179487179486, "grad_norm": 0.1183227151632309, "learning_rate": 9.83764705882353e-06, "loss": 1.0132, "step": 154 }, { "epoch": 0.3974358974358974, "grad_norm": 0.12392973154783249, "learning_rate": 9.836003770028276e-06, "loss": 1.069, "step": 155 }, { "epoch": 0.4, "grad_norm": 0.13140057027339935, "learning_rate": 9.83435582822086e-06, "loss": 1.1211, "step": 156 }, { "epoch": 0.4025641025641026, "grad_norm": 0.1755838245153427, "learning_rate": 9.832703213610588e-06, "loss": 1.053, "step": 157 }, { "epoch": 0.40512820512820513, "grad_norm": 0.12414582073688507, "learning_rate": 9.831045906294368e-06, "loss": 1.0483, "step": 158 }, { "epoch": 0.4076923076923077, "grad_norm": 0.13171876966953278, "learning_rate": 9.829383886255924e-06, "loss": 1.0296, "step": 159 }, { "epoch": 0.41025641025641024, "grad_norm": 0.12738922238349915, "learning_rate": 9.827717133364974e-06, "loss": 1.0102, "step": 160 }, { "epoch": 0.4128205128205128, "grad_norm": 0.1904231309890747, "learning_rate": 9.826045627376427e-06, "loss": 1.0314, "step": 161 }, { "epoch": 0.4153846153846154, "grad_norm": 0.12011483311653137, "learning_rate": 9.824369347929558e-06, "loss": 0.9475, "step": 162 }, { "epoch": 0.41794871794871796, "grad_norm": 0.1304839700460434, "learning_rate": 9.822688274547189e-06, "loss": 1.0456, "step": 163 }, { "epoch": 0.4205128205128205, "grad_norm": 0.131229430437088, "learning_rate": 9.821002386634847e-06, "loss": 1.1589, "step": 164 }, { "epoch": 0.4230769230769231, "grad_norm": 0.12201635539531708, "learning_rate": 9.819311663479923e-06, "loss": 0.966, "step": 165 }, { "epoch": 0.4256410256410256, "grad_norm": 0.12963519990444183, "learning_rate": 9.81761608425084e-06, "loss": 1.0354, "step": 166 }, { "epoch": 0.4282051282051282, "grad_norm": 0.12793965637683868, "learning_rate": 9.815915627996166e-06, "loss": 1.0306, "step": 167 }, { "epoch": 0.4307692307692308, "grad_norm": 0.17451830208301544, "learning_rate": 9.814210273643783e-06, "loss": 1.0694, "step": 168 }, { "epoch": 0.43333333333333335, "grad_norm": 0.14709219336509705, "learning_rate": 9.812500000000001e-06, "loss": 1.1155, "step": 169 }, { "epoch": 0.4358974358974359, "grad_norm": 0.205572709441185, "learning_rate": 9.810784785748676e-06, "loss": 1.0661, "step": 170 }, { "epoch": 0.43846153846153846, "grad_norm": 0.13123241066932678, "learning_rate": 9.809064609450338e-06, "loss": 0.9847, "step": 171 }, { "epoch": 0.441025641025641, "grad_norm": 0.14202538132667542, "learning_rate": 9.807339449541285e-06, "loss": 1.0204, "step": 172 }, { "epoch": 0.44358974358974357, "grad_norm": 0.15248540043830872, "learning_rate": 9.80560928433269e-06, "loss": 0.9468, "step": 173 }, { "epoch": 0.4461538461538462, "grad_norm": 0.296898752450943, "learning_rate": 9.803874092009686e-06, "loss": 1.0747, "step": 174 }, { "epoch": 0.44871794871794873, "grad_norm": 0.12776528298854828, "learning_rate": 9.802133850630456e-06, "loss": 1.0828, "step": 175 }, { "epoch": 0.4512820512820513, "grad_norm": 0.1273936629295349, "learning_rate": 9.800388538125306e-06, "loss": 0.93, "step": 176 }, { "epoch": 0.45384615384615384, "grad_norm": 0.1328604370355606, "learning_rate": 9.79863813229572e-06, "loss": 0.905, "step": 177 }, { "epoch": 0.4564102564102564, "grad_norm": 0.14861007034778595, "learning_rate": 9.796882610813444e-06, "loss": 0.9129, "step": 178 }, { "epoch": 0.45897435897435895, "grad_norm": 0.12911070883274078, "learning_rate": 9.795121951219514e-06, "loss": 0.9161, "step": 179 }, { "epoch": 0.46153846153846156, "grad_norm": 0.12798583507537842, "learning_rate": 9.793356130923302e-06, "loss": 1.015, "step": 180 }, { "epoch": 0.4641025641025641, "grad_norm": 0.14299742877483368, "learning_rate": 9.791585127201565e-06, "loss": 0.9986, "step": 181 }, { "epoch": 0.4666666666666667, "grad_norm": 0.16951002180576324, "learning_rate": 9.789808917197453e-06, "loss": 1.0231, "step": 182 }, { "epoch": 0.46923076923076923, "grad_norm": 0.1452597677707672, "learning_rate": 9.78802747791953e-06, "loss": 0.9805, "step": 183 }, { "epoch": 0.4717948717948718, "grad_norm": 0.12342038750648499, "learning_rate": 9.786240786240787e-06, "loss": 0.9625, "step": 184 }, { "epoch": 0.47435897435897434, "grad_norm": 0.13525085151195526, "learning_rate": 9.784448818897639e-06, "loss": 1.0564, "step": 185 }, { "epoch": 0.47692307692307695, "grad_norm": 0.14499512314796448, "learning_rate": 9.782651552488912e-06, "loss": 1.0676, "step": 186 }, { "epoch": 0.4794871794871795, "grad_norm": 0.13853202760219574, "learning_rate": 9.780848963474828e-06, "loss": 0.9944, "step": 187 }, { "epoch": 0.48205128205128206, "grad_norm": 0.1524648517370224, "learning_rate": 9.779041028175976e-06, "loss": 1.0957, "step": 188 }, { "epoch": 0.4846153846153846, "grad_norm": 0.13356293737888336, "learning_rate": 9.77722772277228e-06, "loss": 1.0397, "step": 189 }, { "epoch": 0.48717948717948717, "grad_norm": 0.1403387039899826, "learning_rate": 9.775409023301933e-06, "loss": 1.1755, "step": 190 }, { "epoch": 0.4897435897435897, "grad_norm": 0.13895130157470703, "learning_rate": 9.773584905660379e-06, "loss": 1.0341, "step": 191 }, { "epoch": 0.49230769230769234, "grad_norm": 0.1415233463048935, "learning_rate": 9.771755345599206e-06, "loss": 1.092, "step": 192 }, { "epoch": 0.4948717948717949, "grad_norm": 0.1509629786014557, "learning_rate": 9.7699203187251e-06, "loss": 0.9921, "step": 193 }, { "epoch": 0.49743589743589745, "grad_norm": 0.13306330144405365, "learning_rate": 9.768079800498753e-06, "loss": 1.0274, "step": 194 }, { "epoch": 0.5, "grad_norm": 0.1483563780784607, "learning_rate": 9.766233766233768e-06, "loss": 0.977, "step": 195 }, { "epoch": 0.5025641025641026, "grad_norm": 0.13634060323238373, "learning_rate": 9.764382191095549e-06, "loss": 0.8818, "step": 196 }, { "epoch": 0.5051282051282051, "grad_norm": 0.13927966356277466, "learning_rate": 9.762525050100202e-06, "loss": 0.9694, "step": 197 }, { "epoch": 0.5076923076923077, "grad_norm": 0.13205285370349884, "learning_rate": 9.760662318113397e-06, "loss": 1.0713, "step": 198 }, { "epoch": 0.5102564102564102, "grad_norm": 0.1272955685853958, "learning_rate": 9.758793969849248e-06, "loss": 0.9304, "step": 199 }, { "epoch": 0.5128205128205128, "grad_norm": 0.14206095039844513, "learning_rate": 9.75691997986915e-06, "loss": 1.0151, "step": 200 }, { "epoch": 0.5153846153846153, "grad_norm": 0.13040238618850708, "learning_rate": 9.755040322580646e-06, "loss": 1.056, "step": 201 }, { "epoch": 0.517948717948718, "grad_norm": 0.13569800555706024, "learning_rate": 9.753154972236246e-06, "loss": 1.0058, "step": 202 }, { "epoch": 0.5205128205128206, "grad_norm": 0.12954074144363403, "learning_rate": 9.751263902932256e-06, "loss": 1.0232, "step": 203 }, { "epoch": 0.5230769230769231, "grad_norm": 0.1352427899837494, "learning_rate": 9.749367088607595e-06, "loss": 1.1409, "step": 204 }, { "epoch": 0.5256410256410257, "grad_norm": 0.13935823738574982, "learning_rate": 9.747464503042597e-06, "loss": 0.9364, "step": 205 }, { "epoch": 0.5282051282051282, "grad_norm": 0.14334161579608917, "learning_rate": 9.745556119857798e-06, "loss": 1.0412, "step": 206 }, { "epoch": 0.5307692307692308, "grad_norm": 0.13454332947731018, "learning_rate": 9.743641912512716e-06, "loss": 1.0093, "step": 207 }, { "epoch": 0.5333333333333333, "grad_norm": 0.1333240121603012, "learning_rate": 9.741721854304638e-06, "loss": 1.0294, "step": 208 }, { "epoch": 0.5358974358974359, "grad_norm": 0.1349434107542038, "learning_rate": 9.739795918367347e-06, "loss": 1.0395, "step": 209 }, { "epoch": 0.5384615384615384, "grad_norm": 0.12995462119579315, "learning_rate": 9.737864077669904e-06, "loss": 1.0675, "step": 210 }, { "epoch": 0.541025641025641, "grad_norm": 0.13981810212135315, "learning_rate": 9.735926305015354e-06, "loss": 1.0434, "step": 211 }, { "epoch": 0.5435897435897435, "grad_norm": 0.13797558844089508, "learning_rate": 9.733982573039467e-06, "loss": 1.1582, "step": 212 }, { "epoch": 0.5461538461538461, "grad_norm": 0.136617973446846, "learning_rate": 9.732032854209446e-06, "loss": 0.9319, "step": 213 }, { "epoch": 0.5487179487179488, "grad_norm": 0.15038198232650757, "learning_rate": 9.730077120822623e-06, "loss": 1.0026, "step": 214 }, { "epoch": 0.5512820512820513, "grad_norm": 0.1529029756784439, "learning_rate": 9.728115345005151e-06, "loss": 1.0784, "step": 215 }, { "epoch": 0.5538461538461539, "grad_norm": 0.13984259963035583, "learning_rate": 9.726147498710677e-06, "loss": 1.1533, "step": 216 }, { "epoch": 0.5564102564102564, "grad_norm": 0.14129801094532013, "learning_rate": 9.724173553719009e-06, "loss": 0.9703, "step": 217 }, { "epoch": 0.558974358974359, "grad_norm": 0.14009319245815277, "learning_rate": 9.722193481634766e-06, "loss": 1.1317, "step": 218 }, { "epoch": 0.5615384615384615, "grad_norm": 0.13649149239063263, "learning_rate": 9.720207253886011e-06, "loss": 1.0083, "step": 219 }, { "epoch": 0.5641025641025641, "grad_norm": 0.13949915766716003, "learning_rate": 9.718214841722885e-06, "loss": 0.9941, "step": 220 }, { "epoch": 0.5666666666666667, "grad_norm": 0.17557266354560852, "learning_rate": 9.716216216216216e-06, "loss": 1.1887, "step": 221 }, { "epoch": 0.5692307692307692, "grad_norm": 0.132981538772583, "learning_rate": 9.714211348256117e-06, "loss": 0.9682, "step": 222 }, { "epoch": 0.5717948717948718, "grad_norm": 0.15944674611091614, "learning_rate": 9.712200208550574e-06, "loss": 0.9442, "step": 223 }, { "epoch": 0.5743589743589743, "grad_norm": 0.15149790048599243, "learning_rate": 9.710182767624022e-06, "loss": 1.1134, "step": 224 }, { "epoch": 0.5769230769230769, "grad_norm": 0.13614985346794128, "learning_rate": 9.7081589958159e-06, "loss": 0.8473, "step": 225 }, { "epoch": 0.5794871794871795, "grad_norm": 0.1307866871356964, "learning_rate": 9.706128863279205e-06, "loss": 0.9158, "step": 226 }, { "epoch": 0.5820512820512821, "grad_norm": 0.14745928347110748, "learning_rate": 9.704092339979015e-06, "loss": 0.9988, "step": 227 }, { "epoch": 0.5846153846153846, "grad_norm": 0.14534030854701996, "learning_rate": 9.702049395691015e-06, "loss": 1.0931, "step": 228 }, { "epoch": 0.5871794871794872, "grad_norm": 0.146283358335495, "learning_rate": 9.7e-06, "loss": 1.2088, "step": 229 }, { "epoch": 0.5897435897435898, "grad_norm": 0.16283774375915527, "learning_rate": 9.697944122298367e-06, "loss": 0.9512, "step": 230 }, { "epoch": 0.5923076923076923, "grad_norm": 0.1303090751171112, "learning_rate": 9.695881731784583e-06, "loss": 0.9181, "step": 231 }, { "epoch": 0.5948717948717949, "grad_norm": 0.14575974643230438, "learning_rate": 9.693812797461662e-06, "loss": 1.0348, "step": 232 }, { "epoch": 0.5974358974358974, "grad_norm": 0.14711220562458038, "learning_rate": 9.691737288135593e-06, "loss": 1.0401, "step": 233 }, { "epoch": 0.6, "grad_norm": 0.14356166124343872, "learning_rate": 9.689655172413794e-06, "loss": 0.9484, "step": 234 }, { "epoch": 0.6025641025641025, "grad_norm": 0.14533978700637817, "learning_rate": 9.687566418703508e-06, "loss": 0.9985, "step": 235 }, { "epoch": 0.6051282051282051, "grad_norm": 0.14926594495773315, "learning_rate": 9.685470995210218e-06, "loss": 1.1273, "step": 236 }, { "epoch": 0.6076923076923076, "grad_norm": 0.15611067414283752, "learning_rate": 9.683368869936036e-06, "loss": 1.0847, "step": 237 }, { "epoch": 0.6102564102564103, "grad_norm": 0.15448501706123352, "learning_rate": 9.681260010678057e-06, "loss": 1.0333, "step": 238 }, { "epoch": 0.6128205128205129, "grad_norm": 0.16903169453144073, "learning_rate": 9.679144385026738e-06, "loss": 0.9972, "step": 239 }, { "epoch": 0.6153846153846154, "grad_norm": 0.14398221671581268, "learning_rate": 9.677021960364222e-06, "loss": 0.9373, "step": 240 }, { "epoch": 0.617948717948718, "grad_norm": 0.16799390316009521, "learning_rate": 9.674892703862662e-06, "loss": 0.9425, "step": 241 }, { "epoch": 0.6205128205128205, "grad_norm": 0.16503410041332245, "learning_rate": 9.672756582482538e-06, "loss": 0.991, "step": 242 }, { "epoch": 0.6230769230769231, "grad_norm": 0.13837389647960663, "learning_rate": 9.670613562970937e-06, "loss": 1.0057, "step": 243 }, { "epoch": 0.6256410256410256, "grad_norm": 0.15482862293720245, "learning_rate": 9.66846361185984e-06, "loss": 1.2884, "step": 244 }, { "epoch": 0.6282051282051282, "grad_norm": 0.17946982383728027, "learning_rate": 9.666306695464364e-06, "loss": 1.0498, "step": 245 }, { "epoch": 0.6307692307692307, "grad_norm": 0.18409568071365356, "learning_rate": 9.664142779881018e-06, "loss": 1.0383, "step": 246 }, { "epoch": 0.6333333333333333, "grad_norm": 0.142312690615654, "learning_rate": 9.661971830985917e-06, "loss": 1.1336, "step": 247 }, { "epoch": 0.6358974358974359, "grad_norm": 0.15140476822853088, "learning_rate": 9.659793814432991e-06, "loss": 1.0985, "step": 248 }, { "epoch": 0.6384615384615384, "grad_norm": 0.1846708357334137, "learning_rate": 9.657608695652173e-06, "loss": 0.9555, "step": 249 }, { "epoch": 0.6410256410256411, "grad_norm": 0.16700689494609833, "learning_rate": 9.655416439847578e-06, "loss": 0.9279, "step": 250 }, { "epoch": 0.6435897435897436, "grad_norm": 0.1423652619123459, "learning_rate": 9.653217011995637e-06, "loss": 1.073, "step": 251 }, { "epoch": 0.6461538461538462, "grad_norm": 0.16811993718147278, "learning_rate": 9.651010376843254e-06, "loss": 1.0924, "step": 252 }, { "epoch": 0.6487179487179487, "grad_norm": 0.15465322136878967, "learning_rate": 9.64879649890591e-06, "loss": 1.078, "step": 253 }, { "epoch": 0.6512820512820513, "grad_norm": 0.1621096432209015, "learning_rate": 9.646575342465754e-06, "loss": 1.0095, "step": 254 }, { "epoch": 0.6538461538461539, "grad_norm": 0.16749772429466248, "learning_rate": 9.644346871569704e-06, "loss": 0.948, "step": 255 }, { "epoch": 0.6564102564102564, "grad_norm": 0.1397644579410553, "learning_rate": 9.64211105002749e-06, "loss": 1.0232, "step": 256 }, { "epoch": 0.658974358974359, "grad_norm": 0.179872065782547, "learning_rate": 9.639867841409692e-06, "loss": 1.0781, "step": 257 }, { "epoch": 0.6615384615384615, "grad_norm": 0.18746939301490784, "learning_rate": 9.63761720904578e-06, "loss": 0.9158, "step": 258 }, { "epoch": 0.6641025641025641, "grad_norm": 0.17777228355407715, "learning_rate": 9.635359116022101e-06, "loss": 1.0611, "step": 259 }, { "epoch": 0.6666666666666666, "grad_norm": 0.15664607286453247, "learning_rate": 9.633093525179857e-06, "loss": 1.0061, "step": 260 }, { "epoch": 0.6692307692307692, "grad_norm": 0.2109006941318512, "learning_rate": 9.630820399113084e-06, "loss": 1.0238, "step": 261 }, { "epoch": 0.6717948717948717, "grad_norm": 0.16149462759494781, "learning_rate": 9.628539700166575e-06, "loss": 1.1312, "step": 262 }, { "epoch": 0.6743589743589744, "grad_norm": 0.13910357654094696, "learning_rate": 9.626251390433817e-06, "loss": 0.9808, "step": 263 }, { "epoch": 0.676923076923077, "grad_norm": 0.15184703469276428, "learning_rate": 9.623955431754875e-06, "loss": 0.9458, "step": 264 }, { "epoch": 0.6794871794871795, "grad_norm": 0.17896021902561188, "learning_rate": 9.621651785714285e-06, "loss": 1.0504, "step": 265 }, { "epoch": 0.6820512820512821, "grad_norm": 0.15594998002052307, "learning_rate": 9.619340413638905e-06, "loss": 1.1288, "step": 266 }, { "epoch": 0.6846153846153846, "grad_norm": 0.14313378930091858, "learning_rate": 9.617021276595746e-06, "loss": 0.9985, "step": 267 }, { "epoch": 0.6871794871794872, "grad_norm": 0.14239932596683502, "learning_rate": 9.614694335389792e-06, "loss": 0.9776, "step": 268 }, { "epoch": 0.6897435897435897, "grad_norm": 0.1516118049621582, "learning_rate": 9.612359550561798e-06, "loss": 1.0086, "step": 269 }, { "epoch": 0.6923076923076923, "grad_norm": 0.1527036875486374, "learning_rate": 9.610016882386046e-06, "loss": 1.1663, "step": 270 }, { "epoch": 0.6948717948717948, "grad_norm": 0.1713275909423828, "learning_rate": 9.607666290868095e-06, "loss": 0.977, "step": 271 }, { "epoch": 0.6974358974358974, "grad_norm": 0.13157938420772552, "learning_rate": 9.60530773574252e-06, "loss": 0.9617, "step": 272 }, { "epoch": 0.7, "grad_norm": 0.17953188717365265, "learning_rate": 9.60294117647059e-06, "loss": 1.0687, "step": 273 }, { "epoch": 0.7025641025641025, "grad_norm": 0.17509308457374573, "learning_rate": 9.60056657223796e-06, "loss": 0.9238, "step": 274 }, { "epoch": 0.7051282051282052, "grad_norm": 0.16777881979942322, "learning_rate": 9.598183881952327e-06, "loss": 1.0203, "step": 275 }, { "epoch": 0.7076923076923077, "grad_norm": 0.1494888812303543, "learning_rate": 9.595793064241049e-06, "loss": 1.0675, "step": 276 }, { "epoch": 0.7102564102564103, "grad_norm": 0.1649765521287918, "learning_rate": 9.593394077448748e-06, "loss": 0.8814, "step": 277 }, { "epoch": 0.7128205128205128, "grad_norm": 0.17049697041511536, "learning_rate": 9.590986879634912e-06, "loss": 0.891, "step": 278 }, { "epoch": 0.7153846153846154, "grad_norm": 0.14463086426258087, "learning_rate": 9.58857142857143e-06, "loss": 0.957, "step": 279 }, { "epoch": 0.717948717948718, "grad_norm": 0.15929310023784637, "learning_rate": 9.586147681740127e-06, "loss": 1.017, "step": 280 }, { "epoch": 0.7205128205128205, "grad_norm": 0.17474216222763062, "learning_rate": 9.583715596330276e-06, "loss": 0.9516, "step": 281 }, { "epoch": 0.7230769230769231, "grad_norm": 0.1831640750169754, "learning_rate": 9.581275129236071e-06, "loss": 1.0512, "step": 282 }, { "epoch": 0.7256410256410256, "grad_norm": 0.1618429571390152, "learning_rate": 9.578826237054085e-06, "loss": 0.9551, "step": 283 }, { "epoch": 0.7282051282051282, "grad_norm": 0.24447672069072723, "learning_rate": 9.576368876080691e-06, "loss": 1.0373, "step": 284 }, { "epoch": 0.7307692307692307, "grad_norm": 0.16472192108631134, "learning_rate": 9.57390300230947e-06, "loss": 0.9911, "step": 285 }, { "epoch": 0.7333333333333333, "grad_norm": 0.1695912629365921, "learning_rate": 9.571428571428573e-06, "loss": 0.9008, "step": 286 }, { "epoch": 0.735897435897436, "grad_norm": 0.1703156977891922, "learning_rate": 9.568945538818077e-06, "loss": 1.0863, "step": 287 }, { "epoch": 0.7384615384615385, "grad_norm": 0.14874251186847687, "learning_rate": 9.566453859547304e-06, "loss": 0.9512, "step": 288 }, { "epoch": 0.7410256410256411, "grad_norm": 0.1689365655183792, "learning_rate": 9.563953488372094e-06, "loss": 1.0063, "step": 289 }, { "epoch": 0.7435897435897436, "grad_norm": 0.17003223299980164, "learning_rate": 9.56144437973209e-06, "loss": 0.9397, "step": 290 }, { "epoch": 0.7461538461538462, "grad_norm": 0.20850569009780884, "learning_rate": 9.55892648774796e-06, "loss": 1.2076, "step": 291 }, { "epoch": 0.7487179487179487, "grad_norm": 0.15689845383167267, "learning_rate": 9.556399766218587e-06, "loss": 1.1031, "step": 292 }, { "epoch": 0.7512820512820513, "grad_norm": 0.160260871052742, "learning_rate": 9.553864168618268e-06, "loss": 1.0275, "step": 293 }, { "epoch": 0.7538461538461538, "grad_norm": 0.1513524055480957, "learning_rate": 9.551319648093842e-06, "loss": 1.008, "step": 294 }, { "epoch": 0.7564102564102564, "grad_norm": 0.17547191679477692, "learning_rate": 9.54876615746181e-06, "loss": 1.0009, "step": 295 }, { "epoch": 0.7589743589743589, "grad_norm": 0.15460693836212158, "learning_rate": 9.546203649205416e-06, "loss": 1.1096, "step": 296 }, { "epoch": 0.7615384615384615, "grad_norm": 0.17146429419517517, "learning_rate": 9.543632075471698e-06, "loss": 0.9587, "step": 297 }, { "epoch": 0.764102564102564, "grad_norm": 0.15998685359954834, "learning_rate": 9.54105138806852e-06, "loss": 0.9264, "step": 298 }, { "epoch": 0.7666666666666667, "grad_norm": 0.17196176946163177, "learning_rate": 9.538461538461538e-06, "loss": 1.0, "step": 299 }, { "epoch": 0.7692307692307693, "grad_norm": 0.18652167916297913, "learning_rate": 9.535862477771191e-06, "loss": 1.1239, "step": 300 }, { "epoch": 0.7717948717948718, "grad_norm": 0.1686553657054901, "learning_rate": 9.533254156769596e-06, "loss": 1.1414, "step": 301 }, { "epoch": 0.7743589743589744, "grad_norm": 0.15988533198833466, "learning_rate": 9.530636525877454e-06, "loss": 0.9027, "step": 302 }, { "epoch": 0.7769230769230769, "grad_norm": 0.15526551008224487, "learning_rate": 9.528009535160905e-06, "loss": 1.0569, "step": 303 }, { "epoch": 0.7794871794871795, "grad_norm": 0.1854647994041443, "learning_rate": 9.52537313432836e-06, "loss": 1.1064, "step": 304 }, { "epoch": 0.782051282051282, "grad_norm": 0.20110487937927246, "learning_rate": 9.522727272727274e-06, "loss": 1.0231, "step": 305 }, { "epoch": 0.7846153846153846, "grad_norm": 0.15321309864521027, "learning_rate": 9.520071899340924e-06, "loss": 1.0294, "step": 306 }, { "epoch": 0.7871794871794872, "grad_norm": 0.1512340009212494, "learning_rate": 9.517406962785115e-06, "loss": 0.9112, "step": 307 }, { "epoch": 0.7897435897435897, "grad_norm": 0.19026243686676025, "learning_rate": 9.514732411304872e-06, "loss": 1.0988, "step": 308 }, { "epoch": 0.7923076923076923, "grad_norm": 0.15860332548618317, "learning_rate": 9.512048192771085e-06, "loss": 0.8795, "step": 309 }, { "epoch": 0.7948717948717948, "grad_norm": 0.2282475382089615, "learning_rate": 9.509354254677129e-06, "loss": 1.0206, "step": 310 }, { "epoch": 0.7974358974358975, "grad_norm": 0.16409388184547424, "learning_rate": 9.50665054413543e-06, "loss": 1.01, "step": 311 }, { "epoch": 0.8, "grad_norm": 0.15974940359592438, "learning_rate": 9.503937007874017e-06, "loss": 0.989, "step": 312 }, { "epoch": 0.8025641025641026, "grad_norm": 0.16357499361038208, "learning_rate": 9.50121359223301e-06, "loss": 0.9589, "step": 313 }, { "epoch": 0.8051282051282052, "grad_norm": 0.1798093467950821, "learning_rate": 9.498480243161095e-06, "loss": 0.9949, "step": 314 }, { "epoch": 0.8076923076923077, "grad_norm": 0.18792827427387238, "learning_rate": 9.495736906211937e-06, "loss": 1.0233, "step": 315 }, { "epoch": 0.8102564102564103, "grad_norm": 0.19793489575386047, "learning_rate": 9.492983526540575e-06, "loss": 1.0853, "step": 316 }, { "epoch": 0.8128205128205128, "grad_norm": 0.148494690656662, "learning_rate": 9.490220048899757e-06, "loss": 1.0836, "step": 317 }, { "epoch": 0.8153846153846154, "grad_norm": 0.20617227256298065, "learning_rate": 9.487446417636253e-06, "loss": 0.9434, "step": 318 }, { "epoch": 0.8179487179487179, "grad_norm": 0.2122315913438797, "learning_rate": 9.484662576687117e-06, "loss": 0.9936, "step": 319 }, { "epoch": 0.8205128205128205, "grad_norm": 0.19928601384162903, "learning_rate": 9.481868469575908e-06, "loss": 0.9267, "step": 320 }, { "epoch": 0.823076923076923, "grad_norm": 0.1821938306093216, "learning_rate": 9.479064039408867e-06, "loss": 0.9761, "step": 321 }, { "epoch": 0.8256410256410256, "grad_norm": 0.1640615314245224, "learning_rate": 9.476249228871069e-06, "loss": 0.9753, "step": 322 }, { "epoch": 0.8282051282051283, "grad_norm": 0.2164408564567566, "learning_rate": 9.4734239802225e-06, "loss": 1.009, "step": 323 }, { "epoch": 0.8307692307692308, "grad_norm": 0.17672689259052277, "learning_rate": 9.470588235294119e-06, "loss": 0.9376, "step": 324 }, { "epoch": 0.8333333333333334, "grad_norm": 0.15482589602470398, "learning_rate": 9.467741935483871e-06, "loss": 1.0459, "step": 325 }, { "epoch": 0.8358974358974359, "grad_norm": 0.1468273401260376, "learning_rate": 9.464885021752642e-06, "loss": 0.9625, "step": 326 }, { "epoch": 0.8384615384615385, "grad_norm": 0.1662525236606598, "learning_rate": 9.462017434620176e-06, "loss": 0.9609, "step": 327 }, { "epoch": 0.841025641025641, "grad_norm": 0.20608198642730713, "learning_rate": 9.45913911416095e-06, "loss": 0.9673, "step": 328 }, { "epoch": 0.8435897435897436, "grad_norm": 0.18384887278079987, "learning_rate": 9.45625e-06, "loss": 1.02, "step": 329 }, { "epoch": 0.8461538461538461, "grad_norm": 0.1636429876089096, "learning_rate": 9.453350031308706e-06, "loss": 1.0665, "step": 330 }, { "epoch": 0.8487179487179487, "grad_norm": 0.16819709539413452, "learning_rate": 9.450439146800503e-06, "loss": 0.9494, "step": 331 }, { "epoch": 0.8512820512820513, "grad_norm": 0.16869591176509857, "learning_rate": 9.447517284726587e-06, "loss": 0.9362, "step": 332 }, { "epoch": 0.8538461538461538, "grad_norm": 0.17681722342967987, "learning_rate": 9.444584382871537e-06, "loss": 0.9596, "step": 333 }, { "epoch": 0.8564102564102564, "grad_norm": 0.1720973253250122, "learning_rate": 9.441640378548898e-06, "loss": 0.9371, "step": 334 }, { "epoch": 0.8589743589743589, "grad_norm": 0.1684177815914154, "learning_rate": 9.438685208596712e-06, "loss": 0.8739, "step": 335 }, { "epoch": 0.8615384615384616, "grad_norm": 0.15152432024478912, "learning_rate": 9.435718809373022e-06, "loss": 1.0489, "step": 336 }, { "epoch": 0.8641025641025641, "grad_norm": 0.16250121593475342, "learning_rate": 9.43274111675127e-06, "loss": 0.9873, "step": 337 }, { "epoch": 0.8666666666666667, "grad_norm": 0.20848555862903595, "learning_rate": 9.429752066115703e-06, "loss": 1.1004, "step": 338 }, { "epoch": 0.8692307692307693, "grad_norm": 0.1785045713186264, "learning_rate": 9.426751592356688e-06, "loss": 1.0217, "step": 339 }, { "epoch": 0.8717948717948718, "grad_norm": 0.14325062930583954, "learning_rate": 9.423739629865986e-06, "loss": 1.0522, "step": 340 }, { "epoch": 0.8743589743589744, "grad_norm": 0.17043401300907135, "learning_rate": 9.420716112531971e-06, "loss": 0.9412, "step": 341 }, { "epoch": 0.8769230769230769, "grad_norm": 0.17380043864250183, "learning_rate": 9.417680973734785e-06, "loss": 1.0975, "step": 342 }, { "epoch": 0.8794871794871795, "grad_norm": 0.15861521661281586, "learning_rate": 9.414634146341465e-06, "loss": 0.9772, "step": 343 }, { "epoch": 0.882051282051282, "grad_norm": 0.17549242079257965, "learning_rate": 9.411575562700965e-06, "loss": 1.0992, "step": 344 }, { "epoch": 0.8846153846153846, "grad_norm": 0.20035730302333832, "learning_rate": 9.408505154639175e-06, "loss": 0.9817, "step": 345 }, { "epoch": 0.8871794871794871, "grad_norm": 0.16777153313159943, "learning_rate": 9.405422853453843e-06, "loss": 0.9846, "step": 346 }, { "epoch": 0.8897435897435897, "grad_norm": 0.17003268003463745, "learning_rate": 9.402328589909445e-06, "loss": 1.022, "step": 347 }, { "epoch": 0.8923076923076924, "grad_norm": 0.17670491337776184, "learning_rate": 9.399222294232017e-06, "loss": 1.1117, "step": 348 }, { "epoch": 0.8948717948717949, "grad_norm": 0.18102656304836273, "learning_rate": 9.396103896103898e-06, "loss": 1.0409, "step": 349 }, { "epoch": 0.8974358974358975, "grad_norm": 0.15800277888774872, "learning_rate": 9.392973324658426e-06, "loss": 0.8649, "step": 350 }, { "epoch": 0.9, "grad_norm": 0.17744143307209015, "learning_rate": 9.389830508474576e-06, "loss": 0.9096, "step": 351 }, { "epoch": 0.9025641025641026, "grad_norm": 0.16629469394683838, "learning_rate": 9.386675375571523e-06, "loss": 1.0208, "step": 352 }, { "epoch": 0.9051282051282051, "grad_norm": 0.15229813754558563, "learning_rate": 9.383507853403142e-06, "loss": 0.9423, "step": 353 }, { "epoch": 0.9076923076923077, "grad_norm": 0.40435972809791565, "learning_rate": 9.38032786885246e-06, "loss": 1.0585, "step": 354 }, { "epoch": 0.9102564102564102, "grad_norm": 0.20711062848567963, "learning_rate": 9.377135348226017e-06, "loss": 1.0715, "step": 355 }, { "epoch": 0.9128205128205128, "grad_norm": 0.20200040936470032, "learning_rate": 9.37393021724819e-06, "loss": 1.0494, "step": 356 }, { "epoch": 0.9153846153846154, "grad_norm": 0.1699497103691101, "learning_rate": 9.370712401055409e-06, "loss": 1.0363, "step": 357 }, { "epoch": 0.9179487179487179, "grad_norm": 0.17281508445739746, "learning_rate": 9.36748182419035e-06, "loss": 1.0238, "step": 358 }, { "epoch": 0.9205128205128205, "grad_norm": 0.18644343316555023, "learning_rate": 9.364238410596028e-06, "loss": 1.0655, "step": 359 }, { "epoch": 0.9230769230769231, "grad_norm": 0.16731494665145874, "learning_rate": 9.360982083609822e-06, "loss": 0.9855, "step": 360 }, { "epoch": 0.9256410256410257, "grad_norm": 0.17364031076431274, "learning_rate": 9.357712765957447e-06, "loss": 0.9505, "step": 361 }, { "epoch": 0.9282051282051282, "grad_norm": 0.1894925981760025, "learning_rate": 9.354430379746837e-06, "loss": 0.9665, "step": 362 }, { "epoch": 0.9307692307692308, "grad_norm": 0.1816585510969162, "learning_rate": 9.35113484646195e-06, "loss": 0.8783, "step": 363 }, { "epoch": 0.9333333333333333, "grad_norm": 0.177452951669693, "learning_rate": 9.347826086956523e-06, "loss": 1.0144, "step": 364 }, { "epoch": 0.9358974358974359, "grad_norm": 0.1650353968143463, "learning_rate": 9.344504021447722e-06, "loss": 0.9883, "step": 365 }, { "epoch": 0.9384615384615385, "grad_norm": 0.155875101685524, "learning_rate": 9.341168569509738e-06, "loss": 1.0195, "step": 366 }, { "epoch": 0.941025641025641, "grad_norm": 0.19764171540737152, "learning_rate": 9.337819650067296e-06, "loss": 1.0666, "step": 367 }, { "epoch": 0.9435897435897436, "grad_norm": 0.15158161520957947, "learning_rate": 9.334457181389078e-06, "loss": 1.1297, "step": 368 }, { "epoch": 0.9461538461538461, "grad_norm": 0.16357675194740295, "learning_rate": 9.331081081081083e-06, "loss": 0.9474, "step": 369 }, { "epoch": 0.9487179487179487, "grad_norm": 0.17813360691070557, "learning_rate": 9.327691266079892e-06, "loss": 1.0161, "step": 370 }, { "epoch": 0.9512820512820512, "grad_norm": 0.15733803808689117, "learning_rate": 9.324287652645861e-06, "loss": 0.8616, "step": 371 }, { "epoch": 0.9538461538461539, "grad_norm": 0.16512970626354218, "learning_rate": 9.320870156356221e-06, "loss": 0.906, "step": 372 }, { "epoch": 0.9564102564102565, "grad_norm": 0.16653649508953094, "learning_rate": 9.317438692098092e-06, "loss": 0.9861, "step": 373 }, { "epoch": 0.958974358974359, "grad_norm": 0.15374256670475006, "learning_rate": 9.313993174061434e-06, "loss": 0.9238, "step": 374 }, { "epoch": 0.9615384615384616, "grad_norm": 0.1622532606124878, "learning_rate": 9.310533515731875e-06, "loss": 0.9759, "step": 375 }, { "epoch": 0.9641025641025641, "grad_norm": 0.19126266241073608, "learning_rate": 9.307059629883482e-06, "loss": 0.9745, "step": 376 }, { "epoch": 0.9666666666666667, "grad_norm": 0.1595565676689148, "learning_rate": 9.303571428571428e-06, "loss": 0.9289, "step": 377 }, { "epoch": 0.9692307692307692, "grad_norm": 0.17021501064300537, "learning_rate": 9.30006882312457e-06, "loss": 1.0675, "step": 378 }, { "epoch": 0.9717948717948718, "grad_norm": 0.17533089220523834, "learning_rate": 9.296551724137932e-06, "loss": 1.0388, "step": 379 }, { "epoch": 0.9743589743589743, "grad_norm": 0.16750235855579376, "learning_rate": 9.2930200414651e-06, "loss": 1.0436, "step": 380 }, { "epoch": 0.9769230769230769, "grad_norm": 0.15664179623126984, "learning_rate": 9.289473684210525e-06, "loss": 0.977, "step": 381 }, { "epoch": 0.9794871794871794, "grad_norm": 0.16151364147663116, "learning_rate": 9.285912560721721e-06, "loss": 1.0423, "step": 382 }, { "epoch": 0.982051282051282, "grad_norm": 0.16488024592399597, "learning_rate": 9.282336578581363e-06, "loss": 0.9958, "step": 383 }, { "epoch": 0.9846153846153847, "grad_norm": 0.2102440744638443, "learning_rate": 9.278745644599303e-06, "loss": 0.9997, "step": 384 }, { "epoch": 0.9871794871794872, "grad_norm": 0.18937990069389343, "learning_rate": 9.275139664804471e-06, "loss": 1.0116, "step": 385 }, { "epoch": 0.9897435897435898, "grad_norm": 0.17054639756679535, "learning_rate": 9.27151854443667e-06, "loss": 0.9257, "step": 386 }, { "epoch": 0.9923076923076923, "grad_norm": 0.16185376048088074, "learning_rate": 9.26788218793829e-06, "loss": 0.8911, "step": 387 }, { "epoch": 0.9948717948717949, "grad_norm": 0.18112531304359436, "learning_rate": 9.26423049894589e-06, "loss": 1.0187, "step": 388 }, { "epoch": 0.9974358974358974, "grad_norm": 0.17097817361354828, "learning_rate": 9.26056338028169e-06, "loss": 0.9781, "step": 389 }, { "epoch": 1.0, "grad_norm": 0.33144575357437134, "learning_rate": 9.256880733944955e-06, "loss": 0.8241, "step": 390 }, { "epoch": 1.0025641025641026, "grad_norm": 0.1937527358531952, "learning_rate": 9.253182461103253e-06, "loss": 1.0755, "step": 391 }, { "epoch": 1.005128205128205, "grad_norm": 0.15882588922977448, "learning_rate": 9.24946846208363e-06, "loss": 0.9799, "step": 392 }, { "epoch": 1.0076923076923077, "grad_norm": 0.16072547435760498, "learning_rate": 9.245738636363637e-06, "loss": 0.9872, "step": 393 }, { "epoch": 1.0102564102564102, "grad_norm": 0.15416628122329712, "learning_rate": 9.241992882562277e-06, "loss": 1.0529, "step": 394 }, { "epoch": 1.0128205128205128, "grad_norm": 0.16685126721858978, "learning_rate": 9.238231098430814e-06, "loss": 0.8965, "step": 395 }, { "epoch": 1.0153846153846153, "grad_norm": 0.17164798080921173, "learning_rate": 9.23445318084346e-06, "loss": 1.0228, "step": 396 }, { "epoch": 1.0179487179487179, "grad_norm": 0.1579882949590683, "learning_rate": 9.230659025787966e-06, "loss": 0.9462, "step": 397 }, { "epoch": 1.0205128205128204, "grad_norm": 0.16084755957126617, "learning_rate": 9.226848528356067e-06, "loss": 1.0358, "step": 398 }, { "epoch": 1.023076923076923, "grad_norm": 0.1533387154340744, "learning_rate": 9.223021582733813e-06, "loss": 0.9388, "step": 399 }, { "epoch": 1.0256410256410255, "grad_norm": 0.16010062396526337, "learning_rate": 9.21917808219178e-06, "loss": 1.053, "step": 400 }, { "epoch": 1.028205128205128, "grad_norm": 0.16239330172538757, "learning_rate": 9.215317919075145e-06, "loss": 0.9812, "step": 401 }, { "epoch": 1.0307692307692307, "grad_norm": 0.1662273406982422, "learning_rate": 9.211440984793628e-06, "loss": 1.0265, "step": 402 }, { "epoch": 1.0333333333333334, "grad_norm": 0.18184800446033478, "learning_rate": 9.20754716981132e-06, "loss": 0.8951, "step": 403 }, { "epoch": 1.035897435897436, "grad_norm": 0.17101332545280457, "learning_rate": 9.203636363636365e-06, "loss": 1.0448, "step": 404 }, { "epoch": 1.0384615384615385, "grad_norm": 0.16575555503368378, "learning_rate": 9.199708454810497e-06, "loss": 0.974, "step": 405 }, { "epoch": 1.041025641025641, "grad_norm": 0.18276239931583405, "learning_rate": 9.195763330898466e-06, "loss": 1.0457, "step": 406 }, { "epoch": 1.0435897435897437, "grad_norm": 0.1637968271970749, "learning_rate": 9.191800878477306e-06, "loss": 0.9338, "step": 407 }, { "epoch": 1.0461538461538462, "grad_norm": 0.20241133868694305, "learning_rate": 9.18782098312546e-06, "loss": 1.0354, "step": 408 }, { "epoch": 1.0487179487179488, "grad_norm": 0.161885067820549, "learning_rate": 9.183823529411765e-06, "loss": 0.9205, "step": 409 }, { "epoch": 1.0512820512820513, "grad_norm": 0.14989648759365082, "learning_rate": 9.179808400884306e-06, "loss": 0.9291, "step": 410 }, { "epoch": 1.0538461538461539, "grad_norm": 0.1757401078939438, "learning_rate": 9.175775480059083e-06, "loss": 0.9478, "step": 411 }, { "epoch": 1.0564102564102564, "grad_norm": 0.1715121567249298, "learning_rate": 9.171724648408586e-06, "loss": 0.9509, "step": 412 }, { "epoch": 1.058974358974359, "grad_norm": 0.1794794499874115, "learning_rate": 9.167655786350149e-06, "loss": 0.9052, "step": 413 }, { "epoch": 1.0615384615384615, "grad_norm": 0.1767176240682602, "learning_rate": 9.163568773234201e-06, "loss": 0.8936, "step": 414 }, { "epoch": 1.064102564102564, "grad_norm": 0.16628186404705048, "learning_rate": 9.15946348733234e-06, "loss": 0.9326, "step": 415 }, { "epoch": 1.0666666666666667, "grad_norm": 0.21831132471561432, "learning_rate": 9.155339805825244e-06, "loss": 1.0548, "step": 416 }, { "epoch": 1.0692307692307692, "grad_norm": 0.19534535706043243, "learning_rate": 9.15119760479042e-06, "loss": 0.9584, "step": 417 }, { "epoch": 1.0717948717948718, "grad_norm": 0.16996845602989197, "learning_rate": 9.147036759189797e-06, "loss": 0.9573, "step": 418 }, { "epoch": 1.0743589743589743, "grad_norm": 0.16845408082008362, "learning_rate": 9.142857142857144e-06, "loss": 0.8904, "step": 419 }, { "epoch": 1.0769230769230769, "grad_norm": 0.1751023381948471, "learning_rate": 9.138658628485306e-06, "loss": 1.0049, "step": 420 }, { "epoch": 1.0794871794871794, "grad_norm": 0.17877094447612762, "learning_rate": 9.134441087613294e-06, "loss": 1.0011, "step": 421 }, { "epoch": 1.082051282051282, "grad_norm": 0.17528527975082397, "learning_rate": 9.130204390613173e-06, "loss": 1.0129, "step": 422 }, { "epoch": 1.0846153846153845, "grad_norm": 0.18258242309093475, "learning_rate": 9.125948406676782e-06, "loss": 1.098, "step": 423 }, { "epoch": 1.087179487179487, "grad_norm": 0.19751989841461182, "learning_rate": 9.121673003802282e-06, "loss": 0.9406, "step": 424 }, { "epoch": 1.0897435897435896, "grad_norm": 0.1751803457736969, "learning_rate": 9.117378048780488e-06, "loss": 0.9812, "step": 425 }, { "epoch": 1.0923076923076924, "grad_norm": 0.16526196897029877, "learning_rate": 9.113063407181055e-06, "loss": 0.9926, "step": 426 }, { "epoch": 1.094871794871795, "grad_norm": 0.17794327437877655, "learning_rate": 9.108728943338438e-06, "loss": 0.8838, "step": 427 }, { "epoch": 1.0974358974358975, "grad_norm": 0.16672289371490479, "learning_rate": 9.104374520337684e-06, "loss": 0.9296, "step": 428 }, { "epoch": 1.1, "grad_norm": 0.1750965267419815, "learning_rate": 9.100000000000001e-06, "loss": 0.9608, "step": 429 }, { "epoch": 1.1025641025641026, "grad_norm": 0.31471797823905945, "learning_rate": 9.095605242868158e-06, "loss": 1.113, "step": 430 }, { "epoch": 1.1051282051282052, "grad_norm": 0.2636878192424774, "learning_rate": 9.091190108191653e-06, "loss": 1.0369, "step": 431 }, { "epoch": 1.1076923076923078, "grad_norm": 0.16675381362438202, "learning_rate": 9.086754453911697e-06, "loss": 0.9234, "step": 432 }, { "epoch": 1.1102564102564103, "grad_norm": 0.19578181207180023, "learning_rate": 9.082298136645965e-06, "loss": 0.8915, "step": 433 }, { "epoch": 1.1128205128205129, "grad_norm": 0.15622036159038544, "learning_rate": 9.077821011673151e-06, "loss": 0.834, "step": 434 }, { "epoch": 1.1153846153846154, "grad_norm": 0.1723235547542572, "learning_rate": 9.073322932917318e-06, "loss": 1.1672, "step": 435 }, { "epoch": 1.117948717948718, "grad_norm": 0.16886036098003387, "learning_rate": 9.068803752931978e-06, "loss": 1.0947, "step": 436 }, { "epoch": 1.1205128205128205, "grad_norm": 0.1680499017238617, "learning_rate": 9.064263322884012e-06, "loss": 1.0609, "step": 437 }, { "epoch": 1.123076923076923, "grad_norm": 0.16996805369853973, "learning_rate": 9.059701492537314e-06, "loss": 0.9502, "step": 438 }, { "epoch": 1.1256410256410256, "grad_norm": 0.17708763480186462, "learning_rate": 9.05511811023622e-06, "loss": 0.9767, "step": 439 }, { "epoch": 1.1282051282051282, "grad_norm": 0.19685539603233337, "learning_rate": 9.050513022888715e-06, "loss": 0.957, "step": 440 }, { "epoch": 1.1307692307692307, "grad_norm": 0.18247175216674805, "learning_rate": 9.045886075949368e-06, "loss": 0.9098, "step": 441 }, { "epoch": 1.1333333333333333, "grad_norm": 0.1733454316854477, "learning_rate": 9.041237113402062e-06, "loss": 0.9007, "step": 442 }, { "epoch": 1.1358974358974359, "grad_norm": 0.19381284713745117, "learning_rate": 9.036565977742447e-06, "loss": 0.8715, "step": 443 }, { "epoch": 1.1384615384615384, "grad_norm": 0.19639398157596588, "learning_rate": 9.031872509960161e-06, "loss": 1.0139, "step": 444 }, { "epoch": 1.141025641025641, "grad_norm": 0.17254269123077393, "learning_rate": 9.027156549520768e-06, "loss": 1.0033, "step": 445 }, { "epoch": 1.1435897435897435, "grad_norm": 0.16717708110809326, "learning_rate": 9.022417934347479e-06, "loss": 0.9119, "step": 446 }, { "epoch": 1.146153846153846, "grad_norm": 0.19381357729434967, "learning_rate": 9.01765650080257e-06, "loss": 1.058, "step": 447 }, { "epoch": 1.1487179487179486, "grad_norm": 0.1698828637599945, "learning_rate": 9.012872083668544e-06, "loss": 1.0112, "step": 448 }, { "epoch": 1.1512820512820512, "grad_norm": 0.1623694747686386, "learning_rate": 9.008064516129034e-06, "loss": 0.9848, "step": 449 }, { "epoch": 1.1538461538461537, "grad_norm": 0.22092927992343903, "learning_rate": 9.003233629749395e-06, "loss": 0.9878, "step": 450 }, { "epoch": 1.1564102564102563, "grad_norm": 0.17867566645145416, "learning_rate": 8.998379254457052e-06, "loss": 1.0226, "step": 451 }, { "epoch": 1.1589743589743589, "grad_norm": 0.1954340934753418, "learning_rate": 8.993501218521528e-06, "loss": 1.044, "step": 452 }, { "epoch": 1.1615384615384616, "grad_norm": 0.1862252950668335, "learning_rate": 8.988599348534203e-06, "loss": 0.9865, "step": 453 }, { "epoch": 1.1641025641025642, "grad_norm": 0.2021472156047821, "learning_rate": 8.983673469387756e-06, "loss": 0.9522, "step": 454 }, { "epoch": 1.1666666666666667, "grad_norm": 0.179282084107399, "learning_rate": 8.97872340425532e-06, "loss": 0.9702, "step": 455 }, { "epoch": 1.1692307692307693, "grad_norm": 0.16460926830768585, "learning_rate": 8.973748974569319e-06, "loss": 1.031, "step": 456 }, { "epoch": 1.1717948717948719, "grad_norm": 0.17866218090057373, "learning_rate": 8.968750000000001e-06, "loss": 1.0303, "step": 457 }, { "epoch": 1.1743589743589744, "grad_norm": 0.21034833788871765, "learning_rate": 8.963726298433634e-06, "loss": 0.9513, "step": 458 }, { "epoch": 1.176923076923077, "grad_norm": 0.24306456744670868, "learning_rate": 8.958677685950415e-06, "loss": 0.9149, "step": 459 }, { "epoch": 1.1794871794871795, "grad_norm": 0.21025407314300537, "learning_rate": 8.95360397680199e-06, "loss": 1.097, "step": 460 }, { "epoch": 1.182051282051282, "grad_norm": 0.18221627175807953, "learning_rate": 8.948504983388704e-06, "loss": 0.9259, "step": 461 }, { "epoch": 1.1846153846153846, "grad_norm": 0.18507330119609833, "learning_rate": 8.94338051623647e-06, "loss": 1.075, "step": 462 }, { "epoch": 1.1871794871794872, "grad_norm": 0.20780031383037567, "learning_rate": 8.938230383973288e-06, "loss": 1.0373, "step": 463 }, { "epoch": 1.1897435897435897, "grad_norm": 0.18843470513820648, "learning_rate": 8.93305439330544e-06, "loss": 0.9142, "step": 464 }, { "epoch": 1.1923076923076923, "grad_norm": 0.20828303694725037, "learning_rate": 8.92785234899329e-06, "loss": 0.9692, "step": 465 }, { "epoch": 1.1948717948717948, "grad_norm": 0.15796665847301483, "learning_rate": 8.922624053826745e-06, "loss": 0.936, "step": 466 }, { "epoch": 1.1974358974358974, "grad_norm": 0.17404665052890778, "learning_rate": 8.917369308600336e-06, "loss": 0.9466, "step": 467 }, { "epoch": 1.2, "grad_norm": 0.20071259140968323, "learning_rate": 8.912087912087912e-06, "loss": 0.9609, "step": 468 }, { "epoch": 1.2025641025641025, "grad_norm": 0.21828190982341766, "learning_rate": 8.90677966101695e-06, "loss": 0.9867, "step": 469 }, { "epoch": 1.205128205128205, "grad_norm": 0.17222779989242554, "learning_rate": 8.901444350042482e-06, "loss": 1.055, "step": 470 }, { "epoch": 1.2076923076923076, "grad_norm": 0.17729543149471283, "learning_rate": 8.896081771720614e-06, "loss": 1.0886, "step": 471 }, { "epoch": 1.2102564102564102, "grad_norm": 0.17077124118804932, "learning_rate": 8.89069171648164e-06, "loss": 0.9318, "step": 472 }, { "epoch": 1.2128205128205127, "grad_norm": 0.1938466727733612, "learning_rate": 8.88527397260274e-06, "loss": 0.9199, "step": 473 }, { "epoch": 1.2153846153846155, "grad_norm": 0.21834906935691833, "learning_rate": 8.879828326180258e-06, "loss": 1.0522, "step": 474 }, { "epoch": 1.217948717948718, "grad_norm": 0.18741706013679504, "learning_rate": 8.87435456110155e-06, "loss": 0.9653, "step": 475 }, { "epoch": 1.2205128205128206, "grad_norm": 0.18282394111156464, "learning_rate": 8.868852459016393e-06, "loss": 1.0237, "step": 476 }, { "epoch": 1.2230769230769232, "grad_norm": 0.17378900945186615, "learning_rate": 8.86332179930796e-06, "loss": 1.055, "step": 477 }, { "epoch": 1.2256410256410257, "grad_norm": 0.21999238431453705, "learning_rate": 8.857762359063314e-06, "loss": 0.97, "step": 478 }, { "epoch": 1.2282051282051283, "grad_norm": 0.18467235565185547, "learning_rate": 8.852173913043478e-06, "loss": 1.0498, "step": 479 }, { "epoch": 1.2307692307692308, "grad_norm": 0.16720303893089294, "learning_rate": 8.846556233653009e-06, "loss": 0.9951, "step": 480 }, { "epoch": 1.2333333333333334, "grad_norm": 0.16237983107566833, "learning_rate": 8.840909090909092e-06, "loss": 1.0361, "step": 481 }, { "epoch": 1.235897435897436, "grad_norm": 0.15937431156635284, "learning_rate": 8.835232252410167e-06, "loss": 0.9251, "step": 482 }, { "epoch": 1.2384615384615385, "grad_norm": 0.17901954054832458, "learning_rate": 8.829525483304043e-06, "loss": 1.0279, "step": 483 }, { "epoch": 1.241025641025641, "grad_norm": 0.21782898902893066, "learning_rate": 8.823788546255506e-06, "loss": 1.0042, "step": 484 }, { "epoch": 1.2435897435897436, "grad_norm": 0.172428160905838, "learning_rate": 8.818021201413429e-06, "loss": 1.069, "step": 485 }, { "epoch": 1.2461538461538462, "grad_norm": 0.1770864725112915, "learning_rate": 8.812223206377328e-06, "loss": 1.0291, "step": 486 }, { "epoch": 1.2487179487179487, "grad_norm": 0.1773838996887207, "learning_rate": 8.80639431616341e-06, "loss": 1.0329, "step": 487 }, { "epoch": 1.2512820512820513, "grad_norm": 0.20391307771205902, "learning_rate": 8.80053428317008e-06, "loss": 0.9828, "step": 488 }, { "epoch": 1.2538461538461538, "grad_norm": 0.16360723972320557, "learning_rate": 8.794642857142858e-06, "loss": 0.9086, "step": 489 }, { "epoch": 1.2564102564102564, "grad_norm": 0.21982532739639282, "learning_rate": 8.788719785138765e-06, "loss": 1.097, "step": 490 }, { "epoch": 1.258974358974359, "grad_norm": 0.19650976359844208, "learning_rate": 8.782764811490128e-06, "loss": 1.014, "step": 491 }, { "epoch": 1.2615384615384615, "grad_norm": 0.1792370080947876, "learning_rate": 8.776777677767778e-06, "loss": 0.9578, "step": 492 }, { "epoch": 1.264102564102564, "grad_norm": 0.2550472617149353, "learning_rate": 8.770758122743683e-06, "loss": 0.985, "step": 493 }, { "epoch": 1.2666666666666666, "grad_norm": 0.18580225110054016, "learning_rate": 8.764705882352942e-06, "loss": 0.9796, "step": 494 }, { "epoch": 1.2692307692307692, "grad_norm": 0.2560383975505829, "learning_rate": 8.758620689655173e-06, "loss": 1.0939, "step": 495 }, { "epoch": 1.2717948717948717, "grad_norm": 0.18652617931365967, "learning_rate": 8.752502274795269e-06, "loss": 0.9389, "step": 496 }, { "epoch": 1.2743589743589743, "grad_norm": 0.170726478099823, "learning_rate": 8.746350364963505e-06, "loss": 1.0236, "step": 497 }, { "epoch": 1.2769230769230768, "grad_norm": 0.20897836983203888, "learning_rate": 8.740164684354986e-06, "loss": 0.9533, "step": 498 }, { "epoch": 1.2794871794871794, "grad_norm": 0.19295988976955414, "learning_rate": 8.73394495412844e-06, "loss": 0.9425, "step": 499 }, { "epoch": 1.282051282051282, "grad_norm": 0.20471826195716858, "learning_rate": 8.727690892364306e-06, "loss": 0.9558, "step": 500 }, { "epoch": 1.2846153846153845, "grad_norm": 0.18632952868938446, "learning_rate": 8.72140221402214e-06, "loss": 0.9701, "step": 501 }, { "epoch": 1.287179487179487, "grad_norm": 0.20619980990886688, "learning_rate": 8.715078630897317e-06, "loss": 0.9442, "step": 502 }, { "epoch": 1.2897435897435898, "grad_norm": 0.16518618166446686, "learning_rate": 8.708719851576993e-06, "loss": 1.0207, "step": 503 }, { "epoch": 1.2923076923076924, "grad_norm": 0.1911863535642624, "learning_rate": 8.70232558139535e-06, "loss": 1.0024, "step": 504 }, { "epoch": 1.294871794871795, "grad_norm": 1.0833367109298706, "learning_rate": 8.695895522388062e-06, "loss": 0.9605, "step": 505 }, { "epoch": 1.2974358974358975, "grad_norm": 0.18326595425605774, "learning_rate": 8.689429373246025e-06, "loss": 0.9348, "step": 506 }, { "epoch": 1.3, "grad_norm": 0.18599998950958252, "learning_rate": 8.682926829268294e-06, "loss": 1.2229, "step": 507 }, { "epoch": 1.3025641025641026, "grad_norm": 0.19638995826244354, "learning_rate": 8.676387582314206e-06, "loss": 0.9343, "step": 508 }, { "epoch": 1.3051282051282052, "grad_norm": 0.1773020327091217, "learning_rate": 8.669811320754717e-06, "loss": 0.9836, "step": 509 }, { "epoch": 1.3076923076923077, "grad_norm": 0.19725504517555237, "learning_rate": 8.663197729422896e-06, "loss": 0.9532, "step": 510 }, { "epoch": 1.3102564102564103, "grad_norm": 0.18866512179374695, "learning_rate": 8.656546489563568e-06, "loss": 0.9729, "step": 511 }, { "epoch": 1.3128205128205128, "grad_norm": 0.18089522421360016, "learning_rate": 8.649857278782113e-06, "loss": 1.0848, "step": 512 }, { "epoch": 1.3153846153846154, "grad_norm": 0.18652409315109253, "learning_rate": 8.643129770992367e-06, "loss": 0.9687, "step": 513 }, { "epoch": 1.317948717948718, "grad_norm": 0.19303199648857117, "learning_rate": 8.636363636363635e-06, "loss": 1.0083, "step": 514 }, { "epoch": 1.3205128205128205, "grad_norm": 0.207601860165596, "learning_rate": 8.629558541266796e-06, "loss": 0.9553, "step": 515 }, { "epoch": 1.323076923076923, "grad_norm": 0.18684937059879303, "learning_rate": 8.622714148219442e-06, "loss": 1.0599, "step": 516 }, { "epoch": 1.3256410256410256, "grad_norm": 0.1821713149547577, "learning_rate": 8.615830115830118e-06, "loss": 1.0457, "step": 517 }, { "epoch": 1.3282051282051281, "grad_norm": 0.1726110726594925, "learning_rate": 8.608906098741529e-06, "loss": 0.8477, "step": 518 }, { "epoch": 1.3307692307692307, "grad_norm": 0.17926542460918427, "learning_rate": 8.601941747572816e-06, "loss": 0.9752, "step": 519 }, { "epoch": 1.3333333333333333, "grad_norm": 0.1952233761548996, "learning_rate": 8.59493670886076e-06, "loss": 0.9636, "step": 520 }, { "epoch": 1.3358974358974358, "grad_norm": 0.1748773604631424, "learning_rate": 8.587890625000001e-06, "loss": 0.9876, "step": 521 }, { "epoch": 1.3384615384615386, "grad_norm": 0.1747111827135086, "learning_rate": 8.580803134182175e-06, "loss": 0.9405, "step": 522 }, { "epoch": 1.3410256410256411, "grad_norm": 0.18903814256191254, "learning_rate": 8.573673870333989e-06, "loss": 1.0787, "step": 523 }, { "epoch": 1.3435897435897437, "grad_norm": 0.16885128617286682, "learning_rate": 8.566502463054187e-06, "loss": 0.9353, "step": 524 }, { "epoch": 1.3461538461538463, "grad_norm": 0.19254456460475922, "learning_rate": 8.559288537549409e-06, "loss": 0.9973, "step": 525 }, { "epoch": 1.3487179487179488, "grad_norm": 0.18546819686889648, "learning_rate": 8.552031714568882e-06, "loss": 0.9529, "step": 526 }, { "epoch": 1.3512820512820514, "grad_norm": 0.17594410479068756, "learning_rate": 8.544731610337974e-06, "loss": 1.0129, "step": 527 }, { "epoch": 1.353846153846154, "grad_norm": 0.19100527465343475, "learning_rate": 8.537387836490528e-06, "loss": 1.043, "step": 528 }, { "epoch": 1.3564102564102565, "grad_norm": 0.18892578780651093, "learning_rate": 8.53e-06, "loss": 0.953, "step": 529 }, { "epoch": 1.358974358974359, "grad_norm": 0.1750698834657669, "learning_rate": 8.522567703109327e-06, "loss": 1.0616, "step": 530 }, { "epoch": 1.3615384615384616, "grad_norm": 0.17712536454200745, "learning_rate": 8.515090543259558e-06, "loss": 0.8927, "step": 531 }, { "epoch": 1.3641025641025641, "grad_norm": 0.1855439990758896, "learning_rate": 8.507568113017155e-06, "loss": 0.995, "step": 532 }, { "epoch": 1.3666666666666667, "grad_norm": 0.17967894673347473, "learning_rate": 8.5e-06, "loss": 0.8988, "step": 533 }, { "epoch": 1.3692307692307693, "grad_norm": 0.167103573679924, "learning_rate": 8.492385786802031e-06, "loss": 0.9392, "step": 534 }, { "epoch": 1.3717948717948718, "grad_norm": 0.1761719435453415, "learning_rate": 8.484725050916498e-06, "loss": 0.9431, "step": 535 }, { "epoch": 1.3743589743589744, "grad_norm": 0.19669947028160095, "learning_rate": 8.477017364657814e-06, "loss": 0.956, "step": 536 }, { "epoch": 1.376923076923077, "grad_norm": 0.17305508255958557, "learning_rate": 8.469262295081969e-06, "loss": 1.0234, "step": 537 }, { "epoch": 1.3794871794871795, "grad_norm": 0.18830622732639313, "learning_rate": 8.461459403905446e-06, "loss": 0.9498, "step": 538 }, { "epoch": 1.382051282051282, "grad_norm": 0.20369920134544373, "learning_rate": 8.453608247422681e-06, "loss": 1.1387, "step": 539 }, { "epoch": 1.3846153846153846, "grad_norm": 0.18848799169063568, "learning_rate": 8.445708376421923e-06, "loss": 0.9122, "step": 540 }, { "epoch": 1.3871794871794871, "grad_norm": 0.17956501245498657, "learning_rate": 8.437759336099585e-06, "loss": 0.9424, "step": 541 }, { "epoch": 1.3897435897435897, "grad_norm": 0.19759565591812134, "learning_rate": 8.429760665972945e-06, "loss": 0.9336, "step": 542 }, { "epoch": 1.3923076923076922, "grad_norm": 0.20953185856342316, "learning_rate": 8.421711899791232e-06, "loss": 0.9995, "step": 543 }, { "epoch": 1.3948717948717948, "grad_norm": 0.1723688542842865, "learning_rate": 8.413612565445026e-06, "loss": 0.9328, "step": 544 }, { "epoch": 1.3974358974358974, "grad_norm": 0.16942423582077026, "learning_rate": 8.405462184873949e-06, "loss": 0.9179, "step": 545 }, { "epoch": 1.4, "grad_norm": 0.16917023062705994, "learning_rate": 8.397260273972604e-06, "loss": 1.0107, "step": 546 }, { "epoch": 1.4025641025641025, "grad_norm": 0.18283595144748688, "learning_rate": 8.389006342494715e-06, "loss": 0.8784, "step": 547 }, { "epoch": 1.405128205128205, "grad_norm": 0.17370331287384033, "learning_rate": 8.380699893955462e-06, "loss": 1.1566, "step": 548 }, { "epoch": 1.4076923076923076, "grad_norm": 0.21643978357315063, "learning_rate": 8.372340425531915e-06, "loss": 1.045, "step": 549 }, { "epoch": 1.4102564102564101, "grad_norm": 0.18621404469013214, "learning_rate": 8.36392742796158e-06, "loss": 0.9533, "step": 550 }, { "epoch": 1.4128205128205127, "grad_norm": 0.1949056088924408, "learning_rate": 8.355460385438972e-06, "loss": 1.0161, "step": 551 }, { "epoch": 1.4153846153846155, "grad_norm": 0.1903102844953537, "learning_rate": 8.346938775510205e-06, "loss": 1.047, "step": 552 }, { "epoch": 1.417948717948718, "grad_norm": 0.17839354276657104, "learning_rate": 8.338362068965518e-06, "loss": 0.9366, "step": 553 }, { "epoch": 1.4205128205128206, "grad_norm": 0.18962249159812927, "learning_rate": 8.32972972972973e-06, "loss": 0.9484, "step": 554 }, { "epoch": 1.4230769230769231, "grad_norm": 0.17600049078464508, "learning_rate": 8.321041214750544e-06, "loss": 0.9337, "step": 555 }, { "epoch": 1.4256410256410257, "grad_norm": 0.20685282349586487, "learning_rate": 8.312295973884657e-06, "loss": 0.9831, "step": 556 }, { "epoch": 1.4282051282051282, "grad_norm": 0.20490646362304688, "learning_rate": 8.303493449781661e-06, "loss": 1.0035, "step": 557 }, { "epoch": 1.4307692307692308, "grad_norm": 0.17430691421031952, "learning_rate": 8.294633077765607e-06, "loss": 0.8622, "step": 558 }, { "epoch": 1.4333333333333333, "grad_norm": 0.2322288304567337, "learning_rate": 8.285714285714285e-06, "loss": 0.9546, "step": 559 }, { "epoch": 1.435897435897436, "grad_norm": 0.19194380939006805, "learning_rate": 8.276736493936054e-06, "loss": 1.0656, "step": 560 }, { "epoch": 1.4384615384615385, "grad_norm": 0.1931033879518509, "learning_rate": 8.267699115044248e-06, "loss": 0.9399, "step": 561 }, { "epoch": 1.441025641025641, "grad_norm": 0.184538334608078, "learning_rate": 8.25860155382908e-06, "loss": 0.9948, "step": 562 }, { "epoch": 1.4435897435897436, "grad_norm": 0.19109323620796204, "learning_rate": 8.249443207126949e-06, "loss": 0.9874, "step": 563 }, { "epoch": 1.4461538461538461, "grad_norm": 0.1646609902381897, "learning_rate": 8.24022346368715e-06, "loss": 0.9157, "step": 564 }, { "epoch": 1.4487179487179487, "grad_norm": 0.19419412314891815, "learning_rate": 8.230941704035874e-06, "loss": 0.9856, "step": 565 }, { "epoch": 1.4512820512820512, "grad_norm": 0.18451392650604248, "learning_rate": 8.221597300337459e-06, "loss": 0.9303, "step": 566 }, { "epoch": 1.4538461538461538, "grad_norm": 0.20760126411914825, "learning_rate": 8.212189616252821e-06, "loss": 1.1956, "step": 567 }, { "epoch": 1.4564102564102563, "grad_norm": 0.2049357295036316, "learning_rate": 8.202718006795016e-06, "loss": 0.9186, "step": 568 }, { "epoch": 1.458974358974359, "grad_norm": 0.18056929111480713, "learning_rate": 8.193181818181819e-06, "loss": 0.9874, "step": 569 }, { "epoch": 1.4615384615384617, "grad_norm": 0.2029920220375061, "learning_rate": 8.18358038768529e-06, "loss": 1.0627, "step": 570 }, { "epoch": 1.4641025641025642, "grad_norm": 0.1772759109735489, "learning_rate": 8.173913043478263e-06, "loss": 0.9109, "step": 571 }, { "epoch": 1.4666666666666668, "grad_norm": 0.2249906063079834, "learning_rate": 8.164179104477612e-06, "loss": 0.92, "step": 572 }, { "epoch": 1.4692307692307693, "grad_norm": 0.1960502713918686, "learning_rate": 8.154377880184333e-06, "loss": 1.07, "step": 573 }, { "epoch": 1.471794871794872, "grad_norm": 0.17915907502174377, "learning_rate": 8.14450867052023e-06, "loss": 1.0765, "step": 574 }, { "epoch": 1.4743589743589745, "grad_norm": 0.214991495013237, "learning_rate": 8.134570765661253e-06, "loss": 0.9856, "step": 575 }, { "epoch": 1.476923076923077, "grad_norm": 0.19141773879528046, "learning_rate": 8.124563445867288e-06, "loss": 1.0069, "step": 576 }, { "epoch": 1.4794871794871796, "grad_norm": 0.18558935821056366, "learning_rate": 8.114485981308412e-06, "loss": 0.9061, "step": 577 }, { "epoch": 1.4820512820512821, "grad_norm": 0.2104201316833496, "learning_rate": 8.104337631887457e-06, "loss": 0.9805, "step": 578 }, { "epoch": 1.4846153846153847, "grad_norm": 0.18049705028533936, "learning_rate": 8.094117647058823e-06, "loss": 0.9658, "step": 579 }, { "epoch": 1.4871794871794872, "grad_norm": 0.22525040805339813, "learning_rate": 8.083825265643448e-06, "loss": 1.0575, "step": 580 }, { "epoch": 1.4897435897435898, "grad_norm": 0.20596688985824585, "learning_rate": 8.07345971563981e-06, "loss": 0.8823, "step": 581 }, { "epoch": 1.4923076923076923, "grad_norm": 0.24059003591537476, "learning_rate": 8.063020214030916e-06, "loss": 0.9827, "step": 582 }, { "epoch": 1.494871794871795, "grad_norm": 0.18533092737197876, "learning_rate": 8.052505966587113e-06, "loss": 1.0123, "step": 583 }, { "epoch": 1.4974358974358974, "grad_norm": 0.20136979222297668, "learning_rate": 8.04191616766467e-06, "loss": 1.0111, "step": 584 }, { "epoch": 1.5, "grad_norm": 0.19839423894882202, "learning_rate": 8.03125e-06, "loss": 1.0131, "step": 585 }, { "epoch": 1.5025641025641026, "grad_norm": 0.18837936222553253, "learning_rate": 8.020506634499398e-06, "loss": 1.166, "step": 586 }, { "epoch": 1.505128205128205, "grad_norm": 0.18904945254325867, "learning_rate": 8.009685230024214e-06, "loss": 0.9491, "step": 587 }, { "epoch": 1.5076923076923077, "grad_norm": 0.17879720032215118, "learning_rate": 7.998784933171326e-06, "loss": 1.0575, "step": 588 }, { "epoch": 1.5102564102564102, "grad_norm": 0.19607414305210114, "learning_rate": 7.98780487804878e-06, "loss": 1.0261, "step": 589 }, { "epoch": 1.5128205128205128, "grad_norm": 0.23364603519439697, "learning_rate": 7.97674418604651e-06, "loss": 0.951, "step": 590 }, { "epoch": 1.5153846153846153, "grad_norm": 0.20051056146621704, "learning_rate": 7.965601965601966e-06, "loss": 1.2431, "step": 591 }, { "epoch": 1.5179487179487179, "grad_norm": 0.19472134113311768, "learning_rate": 7.954377311960544e-06, "loss": 1.0066, "step": 592 }, { "epoch": 1.5205128205128204, "grad_norm": 0.21720701456069946, "learning_rate": 7.943069306930693e-06, "loss": 0.9888, "step": 593 }, { "epoch": 1.523076923076923, "grad_norm": 0.18797412514686584, "learning_rate": 7.93167701863354e-06, "loss": 0.9282, "step": 594 }, { "epoch": 1.5256410256410255, "grad_norm": 0.18229195475578308, "learning_rate": 7.920199501246883e-06, "loss": 1.1075, "step": 595 }, { "epoch": 1.528205128205128, "grad_norm": 0.20988033711910248, "learning_rate": 7.90863579474343e-06, "loss": 0.9834, "step": 596 }, { "epoch": 1.5307692307692307, "grad_norm": 0.18902920186519623, "learning_rate": 7.896984924623117e-06, "loss": 1.0285, "step": 597 }, { "epoch": 1.5333333333333332, "grad_norm": 0.2365204393863678, "learning_rate": 7.885245901639344e-06, "loss": 1.0288, "step": 598 }, { "epoch": 1.5358974358974358, "grad_norm": 0.18257446587085724, "learning_rate": 7.873417721518988e-06, "loss": 1.0293, "step": 599 }, { "epoch": 1.5384615384615383, "grad_norm": 0.17291095852851868, "learning_rate": 7.861499364675985e-06, "loss": 0.9189, "step": 600 }, { "epoch": 1.5410256410256409, "grad_norm": 0.1902029812335968, "learning_rate": 7.849489795918368e-06, "loss": 0.8937, "step": 601 }, { "epoch": 1.5435897435897434, "grad_norm": 0.17989574372768402, "learning_rate": 7.837387964148529e-06, "loss": 1.0091, "step": 602 }, { "epoch": 1.546153846153846, "grad_norm": 0.19586458802223206, "learning_rate": 7.825192802056556e-06, "loss": 0.9092, "step": 603 }, { "epoch": 1.5487179487179488, "grad_norm": 0.2133467197418213, "learning_rate": 7.812903225806452e-06, "loss": 0.957, "step": 604 }, { "epoch": 1.5512820512820513, "grad_norm": 0.22505982220172882, "learning_rate": 7.800518134715025e-06, "loss": 1.0118, "step": 605 }, { "epoch": 1.5538461538461539, "grad_norm": 0.20532438158988953, "learning_rate": 7.788036410923278e-06, "loss": 0.9181, "step": 606 }, { "epoch": 1.5564102564102564, "grad_norm": 0.17881132662296295, "learning_rate": 7.775456919060053e-06, "loss": 1.0308, "step": 607 }, { "epoch": 1.558974358974359, "grad_norm": 0.21090662479400635, "learning_rate": 7.762778505897773e-06, "loss": 1.1082, "step": 608 }, { "epoch": 1.5615384615384615, "grad_norm": 0.223121777176857, "learning_rate": 7.75e-06, "loss": 0.9349, "step": 609 }, { "epoch": 1.564102564102564, "grad_norm": 0.20706158876419067, "learning_rate": 7.737120211360633e-06, "loss": 1.0369, "step": 610 }, { "epoch": 1.5666666666666667, "grad_norm": 0.19180113077163696, "learning_rate": 7.724137931034483e-06, "loss": 0.917, "step": 611 }, { "epoch": 1.5692307692307692, "grad_norm": 0.19626112282276154, "learning_rate": 7.711051930758989e-06, "loss": 1.0926, "step": 612 }, { "epoch": 1.5717948717948718, "grad_norm": 0.19783137738704681, "learning_rate": 7.697860962566846e-06, "loss": 0.9433, "step": 613 }, { "epoch": 1.5743589743589743, "grad_norm": 0.21266983449459076, "learning_rate": 7.684563758389262e-06, "loss": 1.0266, "step": 614 }, { "epoch": 1.5769230769230769, "grad_norm": 0.1945042610168457, "learning_rate": 7.671159029649595e-06, "loss": 0.9966, "step": 615 }, { "epoch": 1.5794871794871796, "grad_norm": 0.1982981264591217, "learning_rate": 7.657645466847092e-06, "loss": 0.9904, "step": 616 }, { "epoch": 1.5820512820512822, "grad_norm": 0.1927499920129776, "learning_rate": 7.644021739130436e-06, "loss": 1.0763, "step": 617 }, { "epoch": 1.5846153846153848, "grad_norm": 0.19995129108428955, "learning_rate": 7.630286493860846e-06, "loss": 0.9884, "step": 618 }, { "epoch": 1.5871794871794873, "grad_norm": 0.17647652328014374, "learning_rate": 7.616438356164383e-06, "loss": 1.0011, "step": 619 }, { "epoch": 1.5897435897435899, "grad_norm": 0.1947464793920517, "learning_rate": 7.6024759284731776e-06, "loss": 1.0027, "step": 620 }, { "epoch": 1.5923076923076924, "grad_norm": 0.20255906879901886, "learning_rate": 7.5883977900552484e-06, "loss": 0.9758, "step": 621 }, { "epoch": 1.594871794871795, "grad_norm": 0.21405860781669617, "learning_rate": 7.574202496532593e-06, "loss": 1.1207, "step": 622 }, { "epoch": 1.5974358974358975, "grad_norm": 0.16839265823364258, "learning_rate": 7.559888579387188e-06, "loss": 0.9939, "step": 623 }, { "epoch": 1.6, "grad_norm": 0.19284895062446594, "learning_rate": 7.545454545454545e-06, "loss": 1.0519, "step": 624 }, { "epoch": 1.6025641025641026, "grad_norm": 0.1863621473312378, "learning_rate": 7.5308988764044946e-06, "loss": 0.9174, "step": 625 }, { "epoch": 1.6051282051282052, "grad_norm": 0.2013963907957077, "learning_rate": 7.516220028208745e-06, "loss": 0.9832, "step": 626 }, { "epoch": 1.6076923076923078, "grad_norm": 0.18340826034545898, "learning_rate": 7.501416430594901e-06, "loss": 0.8789, "step": 627 }, { "epoch": 1.6102564102564103, "grad_norm": 0.1774785965681076, "learning_rate": 7.486486486486487e-06, "loss": 1.1, "step": 628 }, { "epoch": 1.6128205128205129, "grad_norm": 0.18885089457035065, "learning_rate": 7.471428571428571e-06, "loss": 0.9079, "step": 629 }, { "epoch": 1.6153846153846154, "grad_norm": 0.19248345494270325, "learning_rate": 7.456241032998566e-06, "loss": 0.9162, "step": 630 }, { "epoch": 1.617948717948718, "grad_norm": 0.17770878970623016, "learning_rate": 7.440922190201729e-06, "loss": 0.9829, "step": 631 }, { "epoch": 1.6205128205128205, "grad_norm": 0.19071798026561737, "learning_rate": 7.42547033285094e-06, "loss": 0.9304, "step": 632 }, { "epoch": 1.623076923076923, "grad_norm": 0.1921025514602661, "learning_rate": 7.409883720930233e-06, "loss": 0.9004, "step": 633 }, { "epoch": 1.6256410256410256, "grad_norm": 0.21452969312667847, "learning_rate": 7.394160583941606e-06, "loss": 0.9945, "step": 634 }, { "epoch": 1.6282051282051282, "grad_norm": 0.16074183583259583, "learning_rate": 7.378299120234605e-06, "loss": 0.925, "step": 635 }, { "epoch": 1.6307692307692307, "grad_norm": 0.1816839724779129, "learning_rate": 7.362297496318113e-06, "loss": 0.9164, "step": 636 }, { "epoch": 1.6333333333333333, "grad_norm": 0.19317786395549774, "learning_rate": 7.346153846153847e-06, "loss": 0.943, "step": 637 }, { "epoch": 1.6358974358974359, "grad_norm": 0.21708151698112488, "learning_rate": 7.329866270430907e-06, "loss": 0.9576, "step": 638 }, { "epoch": 1.6384615384615384, "grad_norm": 0.200921893119812, "learning_rate": 7.313432835820895e-06, "loss": 0.9782, "step": 639 }, { "epoch": 1.641025641025641, "grad_norm": 0.1886773556470871, "learning_rate": 7.2968515742128935e-06, "loss": 0.989, "step": 640 }, { "epoch": 1.6435897435897435, "grad_norm": 0.19396939873695374, "learning_rate": 7.280120481927711e-06, "loss": 1.026, "step": 641 }, { "epoch": 1.646153846153846, "grad_norm": 0.21198135614395142, "learning_rate": 7.263237518910742e-06, "loss": 1.1656, "step": 642 }, { "epoch": 1.6487179487179486, "grad_norm": 0.22808434069156647, "learning_rate": 7.246200607902737e-06, "loss": 1.0588, "step": 643 }, { "epoch": 1.6512820512820512, "grad_norm": 0.19930703938007355, "learning_rate": 7.229007633587788e-06, "loss": 0.9826, "step": 644 }, { "epoch": 1.6538461538461537, "grad_norm": 0.188712477684021, "learning_rate": 7.211656441717792e-06, "loss": 1.0296, "step": 645 }, { "epoch": 1.6564102564102563, "grad_norm": 0.2129139006137848, "learning_rate": 7.194144838212635e-06, "loss": 1.1525, "step": 646 }, { "epoch": 1.6589743589743589, "grad_norm": 0.20264121890068054, "learning_rate": 7.176470588235295e-06, "loss": 0.9398, "step": 647 }, { "epoch": 1.6615384615384614, "grad_norm": 0.18227992951869965, "learning_rate": 7.1586314152410585e-06, "loss": 1.0267, "step": 648 }, { "epoch": 1.664102564102564, "grad_norm": 0.1936773806810379, "learning_rate": 7.140625e-06, "loss": 0.8565, "step": 649 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1949433982372284, "learning_rate": 7.122448979591836e-06, "loss": 1.0448, "step": 650 }, { "epoch": 1.669230769230769, "grad_norm": 0.19935406744480133, "learning_rate": 7.104100946372239e-06, "loss": 0.9539, "step": 651 }, { "epoch": 1.6717948717948716, "grad_norm": 0.2263110727071762, "learning_rate": 7.085578446909666e-06, "loss": 0.9838, "step": 652 }, { "epoch": 1.6743589743589744, "grad_norm": 0.2188168615102768, "learning_rate": 7.06687898089172e-06, "loss": 0.9958, "step": 653 }, { "epoch": 1.676923076923077, "grad_norm": 0.2003227025270462, "learning_rate": 7.048e-06, "loss": 1.0545, "step": 654 }, { "epoch": 1.6794871794871795, "grad_norm": 0.18214313685894012, "learning_rate": 7.028938906752412e-06, "loss": 0.9917, "step": 655 }, { "epoch": 1.682051282051282, "grad_norm": 0.1975235790014267, "learning_rate": 7.009693053311792e-06, "loss": 1.0347, "step": 656 }, { "epoch": 1.6846153846153846, "grad_norm": 0.2402488738298416, "learning_rate": 6.990259740259741e-06, "loss": 0.8716, "step": 657 }, { "epoch": 1.6871794871794872, "grad_norm": 0.20426467061042786, "learning_rate": 6.970636215334422e-06, "loss": 0.9865, "step": 658 }, { "epoch": 1.6897435897435897, "grad_norm": 0.22642633318901062, "learning_rate": 6.950819672131147e-06, "loss": 0.9125, "step": 659 }, { "epoch": 1.6923076923076923, "grad_norm": 0.23084644973278046, "learning_rate": 6.930807248764415e-06, "loss": 0.9856, "step": 660 }, { "epoch": 1.6948717948717948, "grad_norm": 0.22062422335147858, "learning_rate": 6.910596026490067e-06, "loss": 1.055, "step": 661 }, { "epoch": 1.6974358974358974, "grad_norm": 0.19819729030132294, "learning_rate": 6.89018302828619e-06, "loss": 1.0142, "step": 662 }, { "epoch": 1.7, "grad_norm": 0.2273205816745758, "learning_rate": 6.869565217391305e-06, "loss": 0.9948, "step": 663 }, { "epoch": 1.7025641025641025, "grad_norm": 0.2149330973625183, "learning_rate": 6.848739495798319e-06, "loss": 0.9062, "step": 664 }, { "epoch": 1.7051282051282053, "grad_norm": 0.17445141077041626, "learning_rate": 6.827702702702703e-06, "loss": 1.0166, "step": 665 }, { "epoch": 1.7076923076923078, "grad_norm": 0.21047592163085938, "learning_rate": 6.806451612903226e-06, "loss": 1.0253, "step": 666 }, { "epoch": 1.7102564102564104, "grad_norm": 0.19377169013023376, "learning_rate": 6.784982935153583e-06, "loss": 0.9729, "step": 667 }, { "epoch": 1.712820512820513, "grad_norm": 0.18540802597999573, "learning_rate": 6.763293310463122e-06, "loss": 0.8732, "step": 668 }, { "epoch": 1.7153846153846155, "grad_norm": 0.2677832841873169, "learning_rate": 6.741379310344829e-06, "loss": 1.0237, "step": 669 }, { "epoch": 1.717948717948718, "grad_norm": 0.20734448730945587, "learning_rate": 6.719237435008666e-06, "loss": 1.0527, "step": 670 }, { "epoch": 1.7205128205128206, "grad_norm": 0.1992000937461853, "learning_rate": 6.696864111498258e-06, "loss": 0.9951, "step": 671 }, { "epoch": 1.7230769230769232, "grad_norm": 0.19159814715385437, "learning_rate": 6.6742556917688265e-06, "loss": 1.1233, "step": 672 }, { "epoch": 1.7256410256410257, "grad_norm": 0.2154679298400879, "learning_rate": 6.651408450704226e-06, "loss": 1.0262, "step": 673 }, { "epoch": 1.7282051282051283, "grad_norm": 0.1996496319770813, "learning_rate": 6.628318584070796e-06, "loss": 1.0542, "step": 674 }, { "epoch": 1.7307692307692308, "grad_norm": 0.18427924811840057, "learning_rate": 6.604982206405694e-06, "loss": 0.8912, "step": 675 }, { "epoch": 1.7333333333333334, "grad_norm": 0.1896672397851944, "learning_rate": 6.58139534883721e-06, "loss": 1.1042, "step": 676 }, { "epoch": 1.735897435897436, "grad_norm": 0.2349502146244049, "learning_rate": 6.557553956834534e-06, "loss": 1.0842, "step": 677 }, { "epoch": 1.7384615384615385, "grad_norm": 0.21734175086021423, "learning_rate": 6.533453887884268e-06, "loss": 1.0086, "step": 678 }, { "epoch": 1.741025641025641, "grad_norm": 0.19185325503349304, "learning_rate": 6.5090909090909095e-06, "loss": 0.9509, "step": 679 }, { "epoch": 1.7435897435897436, "grad_norm": 0.18834951519966125, "learning_rate": 6.484460694698354e-06, "loss": 1.0333, "step": 680 }, { "epoch": 1.7461538461538462, "grad_norm": 0.19962508976459503, "learning_rate": 6.459558823529412e-06, "loss": 1.0062, "step": 681 }, { "epoch": 1.7487179487179487, "grad_norm": 0.1968788206577301, "learning_rate": 6.434380776340111e-06, "loss": 0.9652, "step": 682 }, { "epoch": 1.7512820512820513, "grad_norm": 0.19386903941631317, "learning_rate": 6.408921933085502e-06, "loss": 0.9253, "step": 683 }, { "epoch": 1.7538461538461538, "grad_norm": 0.18816019594669342, "learning_rate": 6.38317757009346e-06, "loss": 0.998, "step": 684 }, { "epoch": 1.7564102564102564, "grad_norm": 0.19842685759067535, "learning_rate": 6.357142857142856e-06, "loss": 1.0057, "step": 685 }, { "epoch": 1.758974358974359, "grad_norm": 0.19914638996124268, "learning_rate": 6.330812854442344e-06, "loss": 0.9429, "step": 686 }, { "epoch": 1.7615384615384615, "grad_norm": 0.1913568377494812, "learning_rate": 6.304182509505703e-06, "loss": 0.8765, "step": 687 }, { "epoch": 1.764102564102564, "grad_norm": 0.20887283980846405, "learning_rate": 6.277246653919694e-06, "loss": 1.0437, "step": 688 }, { "epoch": 1.7666666666666666, "grad_norm": 0.1883188635110855, "learning_rate": 6.25e-06, "loss": 0.9215, "step": 689 }, { "epoch": 1.7692307692307692, "grad_norm": 0.1730821281671524, "learning_rate": 6.222437137330755e-06, "loss": 1.0435, "step": 690 }, { "epoch": 1.7717948717948717, "grad_norm": 0.18366935849189758, "learning_rate": 6.194552529182879e-06, "loss": 1.0434, "step": 691 }, { "epoch": 1.7743589743589743, "grad_norm": 0.16954126954078674, "learning_rate": 6.166340508806262e-06, "loss": 0.9655, "step": 692 }, { "epoch": 1.7769230769230768, "grad_norm": 0.20240214467048645, "learning_rate": 6.137795275590551e-06, "loss": 1.0522, "step": 693 }, { "epoch": 1.7794871794871794, "grad_norm": 0.1842651218175888, "learning_rate": 6.1089108910891094e-06, "loss": 0.9502, "step": 694 }, { "epoch": 1.782051282051282, "grad_norm": 0.19008156657218933, "learning_rate": 6.079681274900399e-06, "loss": 0.9194, "step": 695 }, { "epoch": 1.7846153846153845, "grad_norm": 0.18955452740192413, "learning_rate": 6.050100200400802e-06, "loss": 0.9153, "step": 696 }, { "epoch": 1.787179487179487, "grad_norm": 0.18745142221450806, "learning_rate": 6.020161290322582e-06, "loss": 1.0557, "step": 697 }, { "epoch": 1.7897435897435896, "grad_norm": 0.19175171852111816, "learning_rate": 5.9898580121703855e-06, "loss": 1.0186, "step": 698 }, { "epoch": 1.7923076923076922, "grad_norm": 0.21348226070404053, "learning_rate": 5.9591836734693876e-06, "loss": 1.1486, "step": 699 }, { "epoch": 1.7948717948717947, "grad_norm": 0.19678117334842682, "learning_rate": 5.928131416837782e-06, "loss": 1.0151, "step": 700 }, { "epoch": 1.7974358974358975, "grad_norm": 0.17783664166927338, "learning_rate": 5.896694214876034e-06, "loss": 0.9591, "step": 701 }, { "epoch": 1.8, "grad_norm": 0.21038049459457397, "learning_rate": 5.8648648648648655e-06, "loss": 1.0256, "step": 702 }, { "epoch": 1.8025641025641026, "grad_norm": 0.2131882607936859, "learning_rate": 5.832635983263598e-06, "loss": 1.0602, "step": 703 }, { "epoch": 1.8051282051282052, "grad_norm": 0.2531805634498596, "learning_rate": 5.8e-06, "loss": 0.9224, "step": 704 }, { "epoch": 1.8076923076923077, "grad_norm": 0.20389708876609802, "learning_rate": 5.766949152542372e-06, "loss": 0.926, "step": 705 }, { "epoch": 1.8102564102564103, "grad_norm": 0.19052807986736298, "learning_rate": 5.733475479744137e-06, "loss": 1.0813, "step": 706 }, { "epoch": 1.8128205128205128, "grad_norm": 0.23358896374702454, "learning_rate": 5.6995708154506445e-06, "loss": 1.1185, "step": 707 }, { "epoch": 1.8153846153846154, "grad_norm": 0.19410401582717896, "learning_rate": 5.6652267818574515e-06, "loss": 0.9472, "step": 708 }, { "epoch": 1.817948717948718, "grad_norm": 0.20657074451446533, "learning_rate": 5.630434782608696e-06, "loss": 0.9792, "step": 709 }, { "epoch": 1.8205128205128205, "grad_norm": 0.19862636923789978, "learning_rate": 5.5951859956236334e-06, "loss": 1.0459, "step": 710 }, { "epoch": 1.823076923076923, "grad_norm": 0.19714003801345825, "learning_rate": 5.559471365638766e-06, "loss": 1.0864, "step": 711 }, { "epoch": 1.8256410256410256, "grad_norm": 0.2103991061449051, "learning_rate": 5.523281596452329e-06, "loss": 1.0025, "step": 712 }, { "epoch": 1.8282051282051284, "grad_norm": 0.21029628813266754, "learning_rate": 5.486607142857143e-06, "loss": 1.0526, "step": 713 }, { "epoch": 1.830769230769231, "grad_norm": 0.207773357629776, "learning_rate": 5.4494382022471915e-06, "loss": 0.9971, "step": 714 }, { "epoch": 1.8333333333333335, "grad_norm": 0.21237727999687195, "learning_rate": 5.411764705882353e-06, "loss": 0.9528, "step": 715 }, { "epoch": 1.835897435897436, "grad_norm": 0.1775677651166916, "learning_rate": 5.373576309794989e-06, "loss": 0.8888, "step": 716 }, { "epoch": 1.8384615384615386, "grad_norm": 0.21109408140182495, "learning_rate": 5.3348623853211015e-06, "loss": 0.9526, "step": 717 }, { "epoch": 1.8410256410256411, "grad_norm": 0.20082655549049377, "learning_rate": 5.295612009237876e-06, "loss": 0.989, "step": 718 }, { "epoch": 1.8435897435897437, "grad_norm": 0.18796475231647491, "learning_rate": 5.255813953488372e-06, "loss": 1.1235, "step": 719 }, { "epoch": 1.8461538461538463, "grad_norm": 0.19870947301387787, "learning_rate": 5.215456674473068e-06, "loss": 0.9476, "step": 720 }, { "epoch": 1.8487179487179488, "grad_norm": 0.20163416862487793, "learning_rate": 5.174528301886793e-06, "loss": 1.0574, "step": 721 }, { "epoch": 1.8512820512820514, "grad_norm": 0.1803264170885086, "learning_rate": 5.133016627078385e-06, "loss": 0.9504, "step": 722 }, { "epoch": 1.853846153846154, "grad_norm": 0.19215236604213715, "learning_rate": 5.090909090909091e-06, "loss": 1.1705, "step": 723 }, { "epoch": 1.8564102564102565, "grad_norm": 0.2053728848695755, "learning_rate": 5.048192771084337e-06, "loss": 1.0616, "step": 724 }, { "epoch": 1.858974358974359, "grad_norm": 0.18856436014175415, "learning_rate": 5.004854368932039e-06, "loss": 1.1064, "step": 725 }, { "epoch": 1.8615384615384616, "grad_norm": 0.22481724619865417, "learning_rate": 4.960880195599021e-06, "loss": 0.9446, "step": 726 }, { "epoch": 1.8641025641025641, "grad_norm": 0.19489426910877228, "learning_rate": 4.916256157635469e-06, "loss": 0.9482, "step": 727 }, { "epoch": 1.8666666666666667, "grad_norm": 0.18249572813510895, "learning_rate": 4.870967741935484e-06, "loss": 0.9447, "step": 728 }, { "epoch": 1.8692307692307693, "grad_norm": 0.1984269618988037, "learning_rate": 4.825e-06, "loss": 0.8603, "step": 729 }, { "epoch": 1.8717948717948718, "grad_norm": 0.1892971694469452, "learning_rate": 4.778337531486147e-06, "loss": 0.9162, "step": 730 }, { "epoch": 1.8743589743589744, "grad_norm": 0.19022035598754883, "learning_rate": 4.7309644670050755e-06, "loss": 0.915, "step": 731 }, { "epoch": 1.876923076923077, "grad_norm": 0.20524592697620392, "learning_rate": 4.6828644501278775e-06, "loss": 0.9754, "step": 732 }, { "epoch": 1.8794871794871795, "grad_norm": 0.19411511719226837, "learning_rate": 4.6340206185567015e-06, "loss": 0.9368, "step": 733 }, { "epoch": 1.882051282051282, "grad_norm": 0.19343458116054535, "learning_rate": 4.584415584415584e-06, "loss": 1.0132, "step": 734 }, { "epoch": 1.8846153846153846, "grad_norm": 0.197899729013443, "learning_rate": 4.534031413612565e-06, "loss": 1.0597, "step": 735 }, { "epoch": 1.8871794871794871, "grad_norm": 0.22261539101600647, "learning_rate": 4.482849604221636e-06, "loss": 0.9934, "step": 736 }, { "epoch": 1.8897435897435897, "grad_norm": 0.21835994720458984, "learning_rate": 4.430851063829788e-06, "loss": 0.9954, "step": 737 }, { "epoch": 1.8923076923076922, "grad_norm": 0.1972758173942566, "learning_rate": 4.378016085790885e-06, "loss": 0.9468, "step": 738 }, { "epoch": 1.8948717948717948, "grad_norm": 0.18865923583507538, "learning_rate": 4.324324324324325e-06, "loss": 0.9226, "step": 739 }, { "epoch": 1.8974358974358974, "grad_norm": 0.19400173425674438, "learning_rate": 4.2697547683923715e-06, "loss": 1.1236, "step": 740 }, { "epoch": 1.9, "grad_norm": 0.18145526945590973, "learning_rate": 4.2142857142857145e-06, "loss": 0.8993, "step": 741 }, { "epoch": 1.9025641025641025, "grad_norm": 0.2090071588754654, "learning_rate": 4.157894736842105e-06, "loss": 0.9357, "step": 742 }, { "epoch": 1.905128205128205, "grad_norm": 0.240007683634758, "learning_rate": 4.100558659217877e-06, "loss": 0.9786, "step": 743 }, { "epoch": 1.9076923076923076, "grad_norm": 0.19722330570220947, "learning_rate": 4.04225352112676e-06, "loss": 1.0765, "step": 744 }, { "epoch": 1.9102564102564101, "grad_norm": 0.18485118448734283, "learning_rate": 3.982954545454546e-06, "loss": 0.8595, "step": 745 }, { "epoch": 1.9128205128205127, "grad_norm": 0.2154824435710907, "learning_rate": 3.922636103151863e-06, "loss": 1.0258, "step": 746 }, { "epoch": 1.9153846153846152, "grad_norm": 0.2018478512763977, "learning_rate": 3.861271676300577e-06, "loss": 0.9515, "step": 747 }, { "epoch": 1.9179487179487178, "grad_norm": 0.2598324716091156, "learning_rate": 3.798833819241983e-06, "loss": 1.1186, "step": 748 }, { "epoch": 1.9205128205128204, "grad_norm": 0.21484240889549255, "learning_rate": 3.735294117647058e-06, "loss": 0.9179, "step": 749 }, { "epoch": 1.9230769230769231, "grad_norm": 0.20729656517505646, "learning_rate": 3.6706231454005937e-06, "loss": 0.9008, "step": 750 }, { "epoch": 1.9256410256410257, "grad_norm": 0.19938671588897705, "learning_rate": 3.604790419161677e-06, "loss": 0.9061, "step": 751 }, { "epoch": 1.9282051282051282, "grad_norm": 0.19618763029575348, "learning_rate": 3.5377643504531735e-06, "loss": 0.9478, "step": 752 }, { "epoch": 1.9307692307692308, "grad_norm": 0.20993918180465698, "learning_rate": 3.4695121951219514e-06, "loss": 1.091, "step": 753 }, { "epoch": 1.9333333333333333, "grad_norm": 0.19574132561683655, "learning_rate": 3.4e-06, "loss": 1.0154, "step": 754 }, { "epoch": 1.935897435897436, "grad_norm": 0.2107248604297638, "learning_rate": 3.329192546583851e-06, "loss": 0.997, "step": 755 }, { "epoch": 1.9384615384615385, "grad_norm": 0.19578175246715546, "learning_rate": 3.2570532915360505e-06, "loss": 0.9224, "step": 756 }, { "epoch": 1.941025641025641, "grad_norm": 0.20714713633060455, "learning_rate": 3.183544303797469e-06, "loss": 0.9749, "step": 757 }, { "epoch": 1.9435897435897436, "grad_norm": 0.1808098554611206, "learning_rate": 3.1086261980830674e-06, "loss": 0.898, "step": 758 }, { "epoch": 1.9461538461538461, "grad_norm": 0.20211873948574066, "learning_rate": 3.0322580645161295e-06, "loss": 0.9319, "step": 759 }, { "epoch": 1.9487179487179487, "grad_norm": 0.17889924347400665, "learning_rate": 2.9543973941368082e-06, "loss": 1.0142, "step": 760 }, { "epoch": 1.9512820512820512, "grad_norm": 0.20043864846229553, "learning_rate": 2.875e-06, "loss": 1.0167, "step": 761 }, { "epoch": 1.953846153846154, "grad_norm": 0.18134412169456482, "learning_rate": 2.794019933554818e-06, "loss": 0.8732, "step": 762 }, { "epoch": 1.9564102564102566, "grad_norm": 0.19279873371124268, "learning_rate": 2.7114093959731548e-06, "loss": 0.9885, "step": 763 }, { "epoch": 1.9589743589743591, "grad_norm": 0.1957969218492508, "learning_rate": 2.627118644067797e-06, "loss": 0.9453, "step": 764 }, { "epoch": 1.9615384615384617, "grad_norm": 0.2282707840204239, "learning_rate": 2.5410958904109595e-06, "loss": 0.91, "step": 765 }, { "epoch": 1.9641025641025642, "grad_norm": 0.20508873462677002, "learning_rate": 2.453287197231834e-06, "loss": 0.9894, "step": 766 }, { "epoch": 1.9666666666666668, "grad_norm": 0.19494283199310303, "learning_rate": 2.363636363636364e-06, "loss": 1.0989, "step": 767 }, { "epoch": 1.9692307692307693, "grad_norm": 0.19367046654224396, "learning_rate": 2.2720848056537104e-06, "loss": 1.075, "step": 768 }, { "epoch": 1.971794871794872, "grad_norm": 0.1860765963792801, "learning_rate": 2.1785714285714286e-06, "loss": 0.9745, "step": 769 }, { "epoch": 1.9743589743589745, "grad_norm": 0.1922086477279663, "learning_rate": 2.0830324909747296e-06, "loss": 0.9443, "step": 770 }, { "epoch": 1.976923076923077, "grad_norm": 0.20211626589298248, "learning_rate": 1.9854014598540146e-06, "loss": 0.9371, "step": 771 }, { "epoch": 1.9794871794871796, "grad_norm": 0.21594083309173584, "learning_rate": 1.885608856088561e-06, "loss": 1.0045, "step": 772 }, { "epoch": 1.9820512820512821, "grad_norm": 0.18539482355117798, "learning_rate": 1.7835820895522391e-06, "loss": 0.9609, "step": 773 }, { "epoch": 1.9846153846153847, "grad_norm": 0.19419516623020172, "learning_rate": 1.6792452830188683e-06, "loss": 0.9157, "step": 774 }, { "epoch": 1.9871794871794872, "grad_norm": 0.19369378685951233, "learning_rate": 1.572519083969466e-06, "loss": 0.9668, "step": 775 }, { "epoch": 1.9897435897435898, "grad_norm": 0.19599087536334991, "learning_rate": 1.4633204633204633e-06, "loss": 0.9275, "step": 776 }, { "epoch": 1.9923076923076923, "grad_norm": 0.20087149739265442, "learning_rate": 1.3515625000000002e-06, "loss": 1.057, "step": 777 }, { "epoch": 1.994871794871795, "grad_norm": 0.19752490520477295, "learning_rate": 1.2371541501976286e-06, "loss": 1.0176, "step": 778 }, { "epoch": 1.9974358974358974, "grad_norm": 0.17145206034183502, "learning_rate": 1.12e-06, "loss": 0.982, "step": 779 }, { "epoch": 2.0, "grad_norm": 0.435234934091568, "learning_rate": 1.0000000000000002e-06, "loss": 0.8781, "step": 780 } ], "logging_steps": 1, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 195, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.846660370087018e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }